Template developed with materials from https://hbctraining.github.io/main/.
Code
# NOTE: This code will check version, this is our recommendation, it may work on other versions# Requires R version 4stopifnot(R.version$major >=4)# If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higherif (compareVersion(R.version$minor, "3.1") <0) warning("We recommend >= R4.3.1")# Requires BiocManager version 3.18 or laterstopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >=0)
This code is in this revision.
Code
library(tidyverse)# 1. Set up input files in this R file (params_de.R)source(params$params_file)# 2. Set up project file (already done from QC probably)source(params$project_file)# 3. Load custom functions to load data from coldata/metrics/countsmap(list.files(params$functions_file, pattern ="*.R$", full.names = T), source) %>%invisible()# Assign the genome parameter to the object genome# IMPORTANT set these values if you are not using the parameters in the header (lines 23-44)genome <- params$genome# If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warningif (grepl("hg|human", genome, ignore.case =TRUE)) {library(org.Hs.eg.db) ann.org <- org.Hs.eg.db} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) {library(org.Mm.eg.db) ann.org <- org.Mm.eg.db} else {warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.")}# Assigns the column object from the parameters in the YAML header to the column objectcolumn <- params$column# Assigns the contrasts object from the parameters in the YAML header to the contrasts objectcontrasts <- params$contrasts# Assigns the subset_column object from the parameters in the YAML header to the subset_column objectsubset_column <- params$subset_column# Assigns the subset_value object from the parameters in the YAML header to the subset_value objectsubset_value <- params$subset_value# Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv objectrun_ruv <- params$ruv# Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq objectrun_combatseq <- params$combatseq# If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSErun_rmv <- run_ruv | run_combatseq# Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of columnfactor_of_interest <- column
Code
# Load required librarieslibrary(knitr)library(tidyverse)library(glue)library(janitor)library(purrr)library(ggpubr)library(rtracklayer)library(DESeq2)library(vegan)library(EnhancedVolcano)library(DEGreport)library(pheatmap)library(msigdbr)library(fgsea)library(htmltools)library(ggforce)library(RColorBrewer)library(ggprism)library(ggprism)library(grafify)library(viridis)library(ashr)# Overwrite the default filter function with dplyr's filter functionfilter <- dplyr::filter# Overwrite the default select function with dplyr's select functionselect <- dplyr::select# Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12ggplot2::theme_set(theme_prism(base_size =12))# Setting scale color palette to the palette "kelly" from grafify# You can select different palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlscale_colour_discrete <-function(...) {scale_colour_manual(...,values =as.vector(grafify:::graf_palettes[["kelly"]]) )}# Setting scale fill palette to the palette "kelly" from grafify# You can select different palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlscale_fill_discrete <-function(...) {scale_fill_manual(...,values =as.vector(grafify:::graf_palettes[["kelly"]]) )}# Setting color to the palette "kelly" from grafify# You can select diffferent palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlcolors <-as.vector(grafify:::graf_palettes[["kelly"]])# Setting options for Rmd code chunksopts_chunk[["set"]](cache =FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again.cache.lazy =FALSE, # Whether to only load cached objects when neededdev =c("png", "pdf"), # Specifies the graphical devices to use figureserror =TRUE, # Whether to stop rendering on an errorhighlight =TRUE, # Whether to enable syntax highlighting for the code in the chunkmessage =FALSE, # Whether to preserve messages emitted by message() in the final outputprompt =FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documentstidy =FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, stylerwarning =FALSE, # Whether to preserve warnings in the outputecho =TRUE, # Whether to include R source code in the final knitted documentfig.height =4) # Sets the default height of plot figures generated in inches# Set seed for reproducibilityset.seed(1234567890L)
Code
# Create a function to clean up data framessanitize_datatable <-function(df, ...) {# Remove dashes from row names and column names which cause wrapping DT::datatable(df, ...,rownames =gsub("-", "_", rownames(df)),colnames =gsub("-", "_", colnames(df)),options =list(scrollX =TRUE, autoWidth =TRUE),class ="stripe hover" )}
Code
# This code will load from nf-core folder# NOTE make sure to set numerator and denominator# Use the load_coldata function from the sourced load_data.R within the 00_params directory, to subset the colodata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory# The factors for this subsetting were set in thr params section of the YAML code chunk.# The default subsetting is none since subset_column and subset_value are both set to null initially# The column object here is given to ensure that the column of interest exists in the loaded dataframecoldata <-load_coldata( coldata_fn, column, subset_column, subset_value)# Set the row names from the coldata dataframe to be a column called sample# This column should already exist from the Nextflow runcoldata$sample <-row.names(coldata)# Use the load_counts function from the sourced load_data.R within the 00_params directory, to load the counts data from the dataframe pointed to when sourcing params.R within the 00_params directorycounts <-load_counts(counts_fn)# If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframecounts <- counts[, colnames(counts) %in% coldata$sample]# Use the load_metrics function from the sourced load_data.R within the 00_params directory, to load the metrics from the multiqc, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and also the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate).metrics <-load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%left_join(coldata, by =c("sample")) %>%# Join the coldata table to the end of object produced from the load_metrics functionas.data.frame() # Format it as a dataframe# Assign the sample names to be the row names in the metrics objectrownames(metrics) <- metrics$sample# If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframemetrics <-subset(metrics, metrics$sample %in% coldata$sample)# If the sample names in the counts (column) don't match in order or string of the metrics file, reorder them in the counts objectcounts <- counts[, rownames(metrics)]# If the sample names in the coldata object (rows) don't match in order or string of the metrics file, reorder them in the counts objectcoldata <- coldata[rownames(metrics), ]# This will relevel the first contrast in your contrasts list with with the "control_name" as the baseline value defined in the params section of the YAMLcoldata[[contrasts[[1]][1]]] <-relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3])# Stop if the column names in counts does not match and in the same order as the row names in metricsstopifnot(all(colnames(counts) ==rownames(metrics)))
1 Overview
Project: name_XXXXX
PI: person name
Analyst: person in the core
Experiment: short description
Aim: short description
Code
# Query AnnotationHub and assign it to ah objectah <- AnnotationHub::AnnotationHub()# Retrieve the annotations from Annotation Hub dependent on your reference genomeif (grepl("hg|human", genome, ignore.case =TRUE)) { # If the reference genome is human ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations ens <- ens[["AH119325"]] # Assign a human annotation, may need to change} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) { # If the reference genome is mouse ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change} else { # If the reference genome is not human or mouse, prin this warningcat("Warning: Please, get a table that contains gene_id, gene_name and entrez to continue using this template.")}# Create a gene-level dataframe# NOTE: Change this code if you are not using AnnotationHub# The rdata is a table with gene_id, gene_name and entrez columnsrdata <- ensembldb::genes(ens, return.type ="data.frame") %>%# Extract ens annotation as a dataframe dplyr::select(gene_id, gene_name,entrez = entrezid, gene_biotype, description ) %>%# Retrieve certain columns and rename the entrez column dplyr::filter(gene_id %in%rownames(counts)) %>%# Retain only gene annotations in the counts matrix dplyr::distinct(gene_id, .keep_all =TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE# Keep one entrez Id per generdata$entrez <- purrr::map(rdata$entrez, 1) %>%unlist()# Filter not needed genes# rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs# filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs# filter(!grepl('Snor', gene_name)) # Remove snoRNAs# counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes
2 Set up
We recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances, such as:
A high proportion of dropouts (zero counts), where filtering can reduce computational burden
A large number of samples, where pre-filtering low-expressed features can improve performance
Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity
Code
# Establish a null DDS object for DESeq2dds_to_use <-DESeqDataSetFromMatrix(counts, coldata, design =~1)# Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_beforevsd_before <-vst(dds_to_use)# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrixnorm_matrix <-assay(vsd_before)
3 PCA and group level variance.
Principal Component Analysis (PCA) is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings.
Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.
It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.
This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.
Code
# Run PCA on the normalized countspca <-degPCA(norm_matrix,metadata = metrics, # Using the metrics object as our metadatacondition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunkname ="sample", # Column from metadata to use to label the points in the PCA plotdata =TRUE# Include the data)pca$plot +# Create plotggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) +# Add title to plottheme(plot.title =element_text(hjust =0.5)) +# Center title on plotgeom_mark_ellipse(aes(color = .data[[column]])) +# Create ellipses on plotscale_color_grafify("kelly") # Set color parameter for plot
3.1 Analysis of the variance by group
Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. Source
Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the betadisper() function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the anova() function and the second from the permutest() function. We also get pairwise p-values for every group-group comparison.
This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate. Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates.
Code
# Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix# Note, we need to transpose (use t())) the counts matrix for vegdist() to compute dissimilarity indices between the samplesvare.disa <-vegdist(t(assay(vsd_before)))# Evaluate the homogeneity of dispersion within the groupsmod <-betadisper(vare.disa, group = metrics[[factor_of_interest]])# Test the homogeniety of dispersion between groups with ANOVA.# ANOVA tests is there is overall significant variation in variance between groups.anova(mod)
Code
# Test the homogeniety of dispersion between groups with a permutation test# The permutation test considers pairwise comparisons so you can see which group(s) is differentpermutest(mod, pairwise =TRUE)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 719
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 1.2238e-05 1.2238e-05 0.2501 719 0.7014
Residuals 4 1.9573e-04 4.8932e-05
Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
normal tumor
normal 0.6
tumor 0.64326
4 Covariate analysis
Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.
Here, we are using DEGreport::degCovariates() to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. This diagnostic plot helps us determine which variables we may need to add to our DE model.
Code
# Visualize covariates for the PCsdegCovariates(counts = norm_matrix, # Normalized counts matrixmetadata = metrics # Metadata to overlay as covariates)
5 Data modeling
Code
# Enter your design formula# The format should be:# as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest))formula <-as.formula(paste0("~ ", " + ", column))# Stop if the column names in counts does not match and in the same order as the row names in metricsstopifnot(all(colnames(counts) ==rownames(coldata)))# Create the DDS object for DESeq2 and assign it to the object dds_to_usedds_to_use <-DESeqDataSetFromMatrix(countData = counts, # Provide the raw counts datacolData = coldata, # Provide the metadatadesign = formula # Provide the design formula)# Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before# Note: vsd_before and norm_matrix was previously assigned in the setup code chunk w/o variables in the formula so this is overwriting that# NOTE: VST won’t regress out this when normalizingvsd_before <-vst(dds_to_use)# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrixnorm_matrix <-assay(vsd_before)# We are going to modify coldata in the RUV section when is turn on, hence want to retain the originalnew_cdata <- coldata
For this study, this formula is recommended: ~, +sample_type
5.1 Remove unwanted variation
Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation.
5.1.1 Assessing unknown factors
Code
# Print out text discussing is RUVseq will be done# If the RUV parameter in the params section of the YAML is TRUE, thenif (run_ruv) {cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")} else { # If the RUV parameter in the params section of the YAML is FALSE, thencat("There is no need to assess unknown factor for this study.")}
There is no need to assess unknown factor for this study.
There is no need to remove known factors like batch effect in this study.
Code
# Run ComBat-seqadjusted_counts <-ComBat_seq(counts =as.matrix(counts), # Provide the counts matrixbatch = batch, # Provide the batch informationgroup = treatment # Provide the variable of interest information)# NOTE: Running ComBat-seq with multiple variables of interest# adjusted_counts <- ComBat_seq(# counts = as.matrix(counts), # Provide the counts matrix# batch = batch, # Provide the batch information# covar_mod = imp # Provide the variables of interest information# )
Code
# Re-enter the adjusted count from ComBat-seq into DESeq# NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq abovedds_to_use <-DESeqDataSetFromMatrix(countData = adjusted_counts, # Provide the adjusted counts from ComBat-seqcolData = coldata, # Provide coldatadesign = formula # Provide design formula, notably without the covariates used in ComBat-seq)# Perform a variance-stabilized transformation on the ComBat-seq adjusted count datavsd_combat <-vst(dds_to_use, blind =FALSE)# Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrixnorm_matrix <-assay(vsd_combat)# Run degPCA to use prcomp and make a plotpca_combat <-degPCA(counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seqmetadata = coldata, # Provide metadatacondition = column # Provide the variable of interest to color the plot by) +ggtitle("After Combatseq") # Add title# Plot PCA from dgePCApca_combat +scale_color_grafify("kelly") # Apply color scale
6 Differential Expression
Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model.
Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then ‘shrunken’ to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit.
We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.
Code
# Perform differential expression analysis on the DDS objectde <-DESeq(dds_to_use)# Plot the dispersion for the dataDESeq2::plotDispEsts(de)
Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.
In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions.
The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.
Code
# NOTE: We recommend LRT for time series# resultsNames(de) # Check the order is right# Create a list that mirrors the contrasts in the contrasts list object where the pattern is:# <column_of_comparison>_<treatment_name>_vs_<control_name>names_to_use <-lapply(contrasts, function(contrast) { coef <-paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])})# Currently the contrasts list object doesn't have names for the items in the list# Assign the names from names_to_use to the names in the contrasts list objectnames(contrasts) <- names_to_use# Create a list called `de_list` to hold the differential expression information for each contrast# Go through each contrast in the contrasts list object andde_list <-lapply(contrasts, function(contrast) {# Extract the results (LFC, pvalue, padj, etc.) for the contrast resLFC <-results(de, contrast = contrast)# Create a string for the contrast following this pattern:# <column_of_comparison>_<treatment_name>_vs_<control_name> coef <-paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])# Apply LFC shrinkage with `ashr`# NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less. resLFCS <-lfcShrink(dds = de, # Provide the DDS objectcontrast = contrast, # Provide the contrasttype ="ash"# Use `ashr` to apply the shrinkage )# If you want to use `apeglm` instead of `ashr` then use:# resLFCS <- lfcShrink(# dds = de, # Provide the DDS object# coef = coef, # Provide the contrast# type = "apeglm" # Use `apeglm` to apply the shrinkage# )# Create a new results data frame with some extra information res <-as.data.frame(resLFCS) %>%# Extract the results from the resLFCS as a dataframerownames_to_column("gene_id") %>%# Move the rownames to a column of data called "gene_id"left_join(rdata, by ="gene_id") %>%# Combine the results with the rdata object by the gene_id and rdata holds information about each generelocate(gene_name) %>%# Move the "gene_name" column to be the first column dplyr::rename(lfc = log2FoldChange) %>%# Rename the column log2FoldChange to lfcmutate(pi =abs(lfc) *-log10(padj)) %>%# Create a new column named pi that multiplies the absolute value of log fold change by the log10 adjusted adjusted p-valuearrange(-pi) # Arrange the data frame by the highest values of pi# Filter out genes that have no expression or were filtered out by DESEQ2 res <- res[res$baseMean >0, ] %>%# Remove genes with no expressiondrop_na(padj) %>%# Remove genes who don't have an adjusted p-value (lots of lowly expressed genes)drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes)# Extract the significant genes res_sig <- res %>%filter(padj <0.05) %>%# Retain the values that have an adjusted p-value less than 0.05arrange(padj) %>%# Arrange the genes by the lowest adjusted p-valuemutate(gene_name =ifelse(test = gene_name ==""|is.na(gene_name), # If the gene_name value for a gene is NA or blank thenyes = gene_id, # Use the gene_id as the gene_nameno = gene_name # Maintain the gene_name ))# Create a list object called results results <-list(lfc = resLFC, # Assign the unshrunken results data to `lfc`lfcs = resLFCS, # Assign the shrunken results data to `lfcs`all = res, # Assign the full differential expression results to `all`sig = res_sig # Assign the significantly differentially expressed genes to `sig` )# Return the `results` list object to be a list within the object `de_list`return(results)})# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:# de_list <- c(de_list, new_comparison=list(lfc=resLFC,# lfcs=resLFCS,# all=res, sig=res_sig))
6.1 MA plot
This plot can help to:
Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions.
Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions.
Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset.
This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and model coefficient > ±0.5). Points in pink are significant with a small effect (padj < 0.05 and model coefficient < ±0.5) and points in blue are not significant but have a large effect (padj > 0.05 and model coefficient > ±0.5). Grey points are non-significant. The dashed lines correspond to the cutoff values of padj and model coefficient that we have chosen.
From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment.
Code
# Call the appropriate functional analysis databases for your analysisif (grepl("hg|human", genome, ignore.case =TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then all_in_life <-get_databases_v2(sps ="human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <-TRUE# Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then all_in_life <-get_databases_v2(sps ="mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <-TRUE# Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run} else { # If your organism isn't mouse or human thenwarning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message run_FA <-FALSE# Assign the value of FALSE to a object called run_FA and it will not allow the functional analysis sections to run}
8 Pathway Analysis- GSEA
Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA.
Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes.
Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge.
Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes.
In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.
Code
# Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to to different elements of the listfa_gsea_list <-lapply(de_list, function(contrast) {# For a given contrast pull out the full differential expression results res <- contrast[["all"]]# Creating the input for the GSEA analysis gsea_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj)) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc"# Obtain the ENSEMBL ID and gene symbols for the genes in the gsea_input object# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- AnnotationDbi::select(x = ann.org, # Annotation database to searchkeys = gsea_input$gene_id, # The gene_ids to search forkeytype ="ENSEMBL", # The gene_ids (keys) are ENSEMBL annotationscolumns =c("ENTREZID", "SYMBOL") ) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol# Combine gene annotation with the lfc information input_entrezid <-inner_join(x = gsea_input, # Combine the ENSEMBL ID and lfc input withy = input_entrezid, # The ENTREZ ID and gene symbol informationby =c("gene_id"="ENSEMBL") ) %>%# Join these table together by common ENSEMBL IDsfilter(!is.na(ENTREZID)) %>%# Remove the genes without an ENTREZ IDdistinct(ENTREZID, .keep_all =TRUE) %>%# Retain the unique ENTREZ IDs and all columns of dataarrange(desc(lfc)) # Arrange the data frame by descending values of lfc# Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb tb <-run_fgsea_v2(input = input_entrezid, # Query for the GSEA analysisall_in_life = all_in_life ) # Annotation databases to query against tb %>%filter(padj <0.05) %>%arrange(padj) # Filter and sort the GSEA output for results with an adjust p-value less than 0.05 and return them})
Code
# NOTE: DT::datatables doesn't work with tabset and for loops# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Assign the significantly enriched pathways to the object res_sig res_sig <- fa_gsea_list[[contrast]] %>%# Use the GSEA obect for a given contrast. NOTE: These have already been filter for adjusted p-value less than 0.05.filter(padj <0.05) %>%arrange(padj) # Filter and sort for adjusted p-values of less than 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways )}# Render the dt_list into HTMLtagList(dt_list)
sample_type_tumor_vs_normal
sample_type_normal_vs_tumor
9 Pathway Analysis- Over-representation
Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA:
Simplicity: Easy to perform and interpret.
Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
# Go through each contrast in the de_list list object andfa_list <-lapply(de_list, function(contrast) {# For a given contrast pull out the full differential expression results res <- contrast[["all"]]# Extract the universe of genes to the ORA on and assign it to the object universe universe <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj)) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list objectpull(gene_id) # Extract the gene_id column# Subset the full annotations to only contain universe we will evaluate within universe_mapping <- rdata %>%# Start with the full annotationsfilter(gene_id %in% universe, !is.na(entrez)) # Only retain the the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID# AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL'))# Create the input vector of genes for the ORA with a positive or negative lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, abs(lfc) >0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the absolute value of the lfc is greater than 0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (ajusted p-value less than 0.01, lfc greater than the absolute value of 0.3) and have an ENTREZ ID# AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory all <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create the input vector of genes for the ORA with a positive lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, lfc >0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is greater than 0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory up <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create the input vector of genes for the ORA with a negative lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, lfc <-0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is less than -0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory down <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create a list to hold the ORA results for a given contrastlist(all = all, # Assign the positive and negative lfc combined ORA results to "all"up = up, # Assign the positive lfc combined ORA results to "up"down = down # Assign the negative lfc combined ORA results to "down" )})
9.1 All significant genes
Code
# NOTE DT::datatables doesn't work with tabset and for loops# You can use the following code to print dynamically or call manually sanitize_datatable() multiple timesdt_list <-list()for (contrast innames(de_list)) { res_sig <- fa_list[[contrast]][["all"]] %>%filter(padj <0.05) %>%arrange(padj) # Filter and sort dt_list <-c( dt_list,list(h3(contrast)),list(sanitize_datatable(res_sig)) )}tagList(dt_list)
sample_type_tumor_vs_normal
sample_type_normal_vs_tumor
9.2 Down-regulated genes
Code
# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc res_sig <- fa_list[[contrast]][["down"]] %>%# Start with the complete list of terms used in the ORA evaluated for negative lfcfilter(padj <0.05) %>%arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a negative lfc )}# Render the dt_list into HTML for the significantly over-represented terms for genes with a negative lfctagList(dt_list)
sample_type_tumor_vs_normal
sample_type_normal_vs_tumor
9.3 Up-regulated genes
Code
# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc res_sig <- fa_list[[contrast]][["up"]] %>%# Start with the complete list of terms used in the ORA evaluated for positive lfcfilter(padj <0.05) %>%arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a positive lfc )}# Render the dt_list into HTML for the significantly over-represented terms for genes with a positive lfctagList(dt_list)
sample_type_tumor_vs_normal
sample_type_normal_vs_tumor
10 Save files
Code
# If the subset value from the params in the YAML is not null thenif (!is.null(subset_value) &!is.null(subset_column)) {# Interpret the value of subset_value and assign it to the object called filenames filenames <-str_interp("${subset_value}")} else { # If the subset value from the params in the YAML is null then# Assign the calue of "full" to the object called filenames filenames <-"full"}# Assign the value of filenames to a new object called filenamefilename <-paste0(filenames)# Assign the file path to save the file path for the expression data CSV to the object name_expression_fnname_expression_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv")# Extract the variance stabilized transformed counts matrix into a data frame to be savedcounts_norm <- norm_matrix %>%# Start with the variance stabilized transformed counts matrixas.data.frame() %>%# Convert the matrix to a data frame# Convert the row names (ENSEMBL IDs) to a column called "gene_id"rownames_to_column("gene_id") %>%left_join(rdata, by ="gene_id")# Write the variance stabilized transformed counts matrix to a CSV file with the name_expression_fn pathwrite_csv(counts_norm, name_expression_fn)# For each of the contrasts in the de_list list object to save the differential expression analysis and ORA to their respective CSV filesfor (contrast innames(contrasts)) {# Create a file name for the differential expression analysis name_deg_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv" )# Create a file name for the ORA analysis name_pathways_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv" )# Assign the full differential expression analysis for a given contrast to the object called res_for_writing res_for_writing <- de_list[[contrast]][["all"]] %>%# Start with the full differential expression analysis dataframe for the given contrastmutate(comparison = contrast) # Add a column to the table with the given contrast# Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing# NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code pathways_for_writing <- fa_list[[contrast]][["all"]] %>%# Start with the full ORA dataframe evaluating negative and positive lfc for the given contrastmutate(comparison = contrast) # Add a column to the table with the given contrast# Write the full differential expression analysis for a given contrast to a CSV file with the name_deg_fn pathwrite_csv(res_for_writing, name_deg_fn)# Write the full ORA for a given contrast to a CSV file with the name_pathways_fn pathwrite_csv(pathways_for_writing, name_pathways_fn)}
If you would like to use DEGpatterns.Rmd template to visualize trends of gene changes across conditions/groups, please save the following output:
Code
# Save the DESeq object to an RDS filesaveRDS(dds_to_use, "DEGpattern_deseq_obj.rds")# Save the metadata to an RDS filesaveRDS(coldata, "deseq_coldata.rds")# Set the adjusted p-value threshold to 0.05padj.cutoff <-0.05# Assign a value for the top n genes to XXXtopN <-1000# Go through each contrast in the de_list list object and extract the significance values for the significant genes for top N significant genes# This will go through the de_list object, assign the name of the contrast in the de_list to contrast and assign its associated differential expression list (containing lfc, lfcs, all and sig) to the object contrast_dedeg_List <-imap(de_list, \(contrast_de, contrast){# Extract the significant genes from a given contrast and assign them to res_LRT res_LRT <- contrast_de[["sig"]]# Create a named vector where the values in the vector are the adjusted p-values and the names are the index of the value gene_padj <-setNames(res_LRT$padj, rownames(res_LRT))# Subset the gene_padj vector by the adjusted p-value threshold established in the top of this code block gene_sig <- gene_padj[gene_padj < padj.cutoff]# Arrange the gene significance values with the most significant first (they should already be arranged this way though), then keep either the first N genes defined by topN at the top of this code block or the total number of significant genes, whichever is fewer and assign it to the object called degs degs <-sort(gene_sig)[1:min(length(gene_sig), topN)]# Save the RDS object for the significance values for a given contrast, adjusted p-value threshold and number of top N genessaveRDS(degs, glue("DEGpattern_deseq_{contrast}_DEGs_padj{padj.cutoff}_topN{topN}.rds"))# Populate the deg_List with the degs vector for each contrastreturn(degs)})# Please note, you can choose to combine all significant results across contrast as well
11 Conclusion
12 Methods
12.1 R package references
Code
# Provide citationscitation("DESeq2")
To cite package ‘DESeq2’ in publications use:
Love, M.I., Huber, W., Anders, S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 Genome Biology 15(12):550 (2014)
A BibTeX entry for LaTeX users is
@Article{, title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, author = {Michael I. Love and Wolfgang Huber and Simon Anders}, year = {2014}, journal = {Genome Biology}, doi = {10.1186/s13059-014-0550-8}, volume = {15}, issue = {12}, pages = {550}, }
Code
citation("ggplot2")
To cite ggplot2 in publications, please use
H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
A BibTeX entry for LaTeX users is
@Book{, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, publisher = {Springer-Verlag New York}, year = {2016}, isbn = {978-3-319-24277-4}, url = {https://ggplot2.tidyverse.org}, }
---title: "Differential Expression"author: "Harvard Chan Bioinformatics Core"date: "`r Sys.Date()`"format: html: code-fold: true code-tools: true code-overflow: wrap df-print: paged highlight-style: pygments number-sections: true self-contained: true theme: default toc: true toc-location: left toc-expand: false lightbox: true page-layout: fullparams: ## Combatseq and ruv can both be false or ONLY ONE can be true # column is the factor of interest in your data that you are trying to evaluate # For example, this might be "condition" or "treatment" column: "sample_type" # This is a list of contrasts that you would like to evaluate # The syntax for each item in the list is: c("column_name", "treatment_name", "control_name") contrasts: value: - ["sample_type", "tumor", "normal"] - ["sample_type", "normal", "tumor"] # These next two subset lines are useful if you would like to subset your data in someway before you run the analysis # The column of data in you would like to subset, perhaps: "cell_type" or "batch" subset_column: null # Then, which values for that subset_column would you like to keep, like "macrophages" or "batch_2" subset_value: null # Reference genome used genome: hg38 # If you would like to run RUV, set to TRUE, otherwise FALSE ruv: false # If you would like to run ComBatseq, set to TRUE, otherwise FALSE combatseq: false params_file: ../00_params/params-example.R # example data # Path to the information.R file project_file: ../information.R # Path to the directory with the functions functions_file: ../00_libs---Template developed with materials from https://hbctraining.github.io/main/.```{r libraries}# NOTE: This code will check version, this is our recommendation, it may work on other versions# Requires R version 4stopifnot(R.version$major >=4)# If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higherif (compareVersion(R.version$minor, "3.1") <0) warning("We recommend >= R4.3.1")# Requires BiocManager version 3.18 or laterstopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >=0)```This code is in this  revision.```{r load-params, cache = FALSE, message = FALSE, warning=FALSE}library(tidyverse)# 1. Set up input files in this R file (params_de.R)source(params$params_file)# 2. Set up project file (already done from QC probably)source(params$project_file)# 3. Load custom functions to load data from coldata/metrics/countsmap(list.files(params$functions_file, pattern ="*.R$", full.names = T), source) %>%invisible()# Assign the genome parameter to the object genome# IMPORTANT set these values if you are not using the parameters in the header (lines 23-44)genome <- params$genome# If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warningif (grepl("hg|human", genome, ignore.case =TRUE)) {library(org.Hs.eg.db) ann.org <- org.Hs.eg.db} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) {library(org.Mm.eg.db) ann.org <- org.Mm.eg.db} else {warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.")}# Assigns the column object from the parameters in the YAML header to the column objectcolumn <- params$column# Assigns the contrasts object from the parameters in the YAML header to the contrasts objectcontrasts <- params$contrasts# Assigns the subset_column object from the parameters in the YAML header to the subset_column objectsubset_column <- params$subset_column# Assigns the subset_value object from the parameters in the YAML header to the subset_value objectsubset_value <- params$subset_value# Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv objectrun_ruv <- params$ruv# Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq objectrun_combatseq <- params$combatseq# If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSErun_rmv <- run_ruv | run_combatseq# Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of columnfactor_of_interest <- column``````{r load-libraries, cache = FALSE, message = FALSE, warning=FALSE}# Load required librarieslibrary(knitr)library(tidyverse)library(glue)library(janitor)library(purrr)library(ggpubr)library(rtracklayer)library(DESeq2)library(vegan)library(EnhancedVolcano)library(DEGreport)library(pheatmap)library(msigdbr)library(fgsea)library(htmltools)library(ggforce)library(RColorBrewer)library(ggprism)library(ggprism)library(grafify)library(viridis)library(ashr)# Overwrite the default filter function with dplyr's filter functionfilter <- dplyr::filter# Overwrite the default select function with dplyr's select functionselect <- dplyr::select# Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12ggplot2::theme_set(theme_prism(base_size =12))# Setting scale color palette to the palette "kelly" from grafify# You can select different palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlscale_colour_discrete <-function(...) {scale_colour_manual(...,values =as.vector(grafify:::graf_palettes[["kelly"]]) )}# Setting scale fill palette to the palette "kelly" from grafify# You can select different palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlscale_fill_discrete <-function(...) {scale_fill_manual(...,values =as.vector(grafify:::graf_palettes[["kelly"]]) )}# Setting color to the palette "kelly" from grafify# You can select diffferent palettes from here:# https://grafify-vignettes.netlify.app/colour_palettes.htmlcolors <-as.vector(grafify:::graf_palettes[["kelly"]])# Setting options for Rmd code chunksopts_chunk[["set"]](cache =FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again.cache.lazy =FALSE, # Whether to only load cached objects when neededdev =c("png", "pdf"), # Specifies the graphical devices to use figureserror =TRUE, # Whether to stop rendering on an errorhighlight =TRUE, # Whether to enable syntax highlighting for the code in the chunkmessage =FALSE, # Whether to preserve messages emitted by message() in the final outputprompt =FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documentstidy =FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, stylerwarning =FALSE, # Whether to preserve warnings in the outputecho =TRUE, # Whether to include R source code in the final knitted documentfig.height =4) # Sets the default height of plot figures generated in inches# Set seed for reproducibilityset.seed(1234567890L)``````{r sanitize-datatable}# Create a function to clean up data framessanitize_datatable <-function(df, ...) {# Remove dashes from row names and column names which cause wrapping DT::datatable(df, ...,rownames =gsub("-", "_", rownames(df)),colnames =gsub("-", "_", colnames(df)),options =list(scrollX =TRUE, autoWidth =TRUE),class ="stripe hover" )}``````{r load-data, message=F, warning=F}# This code will load from nf-core folder# NOTE make sure to set numerator and denominator# Use the load_coldata function from the sourced load_data.R within the 00_params directory, to subset the colodata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory# The factors for this subsetting were set in thr params section of the YAML code chunk.# The default subsetting is none since subset_column and subset_value are both set to null initially# The column object here is given to ensure that the column of interest exists in the loaded dataframecoldata <-load_coldata( coldata_fn, column, subset_column, subset_value)# Set the row names from the coldata dataframe to be a column called sample# This column should already exist from the Nextflow runcoldata$sample <-row.names(coldata)# Use the load_counts function from the sourced load_data.R within the 00_params directory, to load the counts data from the dataframe pointed to when sourcing params.R within the 00_params directorycounts <-load_counts(counts_fn)# If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframecounts <- counts[, colnames(counts) %in% coldata$sample]# Use the load_metrics function from the sourced load_data.R within the 00_params directory, to load the metrics from the multiqc, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and also the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate).metrics <-load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%left_join(coldata, by =c("sample")) %>%# Join the coldata table to the end of object produced from the load_metrics functionas.data.frame() # Format it as a dataframe# Assign the sample names to be the row names in the metrics objectrownames(metrics) <- metrics$sample# If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframemetrics <-subset(metrics, metrics$sample %in% coldata$sample)# If the sample names in the counts (column) don't match in order or string of the metrics file, reorder them in the counts objectcounts <- counts[, rownames(metrics)]# If the sample names in the coldata object (rows) don't match in order or string of the metrics file, reorder them in the counts objectcoldata <- coldata[rownames(metrics), ]# This will relevel the first contrast in your contrasts list with with the "control_name" as the baseline value defined in the params section of the YAMLcoldata[[contrasts[[1]][1]]] <-relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3])# Stop if the column names in counts does not match and in the same order as the row names in metricsstopifnot(all(colnames(counts) ==rownames(metrics)))```# Overview- Project: `r project`- PI: `r PI`- Analyst: `r analyst`- Experiment: `r experiment`- Aim: `r aim````{r load-counts-data}# Query AnnotationHub and assign it to ah objectah <- AnnotationHub::AnnotationHub()# Retrieve the annotations from Annotation Hub dependent on your reference genomeif (grepl("hg|human", genome, ignore.case =TRUE)) { # If the reference genome is human ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations ens <- ens[["AH119325"]] # Assign a human annotation, may need to change} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) { # If the reference genome is mouse ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change} else { # If the reference genome is not human or mouse, prin this warningcat("Warning: Please, get a table that contains gene_id, gene_name and entrez to continue using this template.")}# Create a gene-level dataframe# NOTE: Change this code if you are not using AnnotationHub# The rdata is a table with gene_id, gene_name and entrez columnsrdata <- ensembldb::genes(ens, return.type ="data.frame") %>%# Extract ens annotation as a dataframe dplyr::select(gene_id, gene_name,entrez = entrezid, gene_biotype, description ) %>%# Retrieve certain columns and rename the entrez column dplyr::filter(gene_id %in%rownames(counts)) %>%# Retain only gene annotations in the counts matrix dplyr::distinct(gene_id, .keep_all =TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE# Keep one entrez Id per generdata$entrez <- purrr::map(rdata$entrez, 1) %>%unlist()# Filter not needed genes# rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs# filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs# filter(!grepl('Snor', gene_name)) # Remove snoRNAs# counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes```# Set upWe recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances, such as:- A high proportion of dropouts (zero counts), where filtering can reduce computational burden- A large number of samples, where pre-filtering low-expressed features can improve performance- Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity```{r setup}# Establish a null DDS object for DESeq2dds_to_use <-DESeqDataSetFromMatrix(counts, coldata, design =~1)# Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_beforevsd_before <-vst(dds_to_use)# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrixnorm_matrix <-assay(vsd_before)```# PCA and group level variance.**Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings.Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.****This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.**```{r set-group, eval=FALSE, echo=FALSE}## Example of creating a group covariate# meta$group <- paste0(meta$sex, "_", meta$age, "_", meta$treatment)# Re-assign the factor of interest in case it has change# You can modify this to test for different factors of interest with:# factor_of_interest <- "insert column name for covariate of interest"factor_of_interest <- column``````{r plot-pca}# Run PCA on the normalized countspca <-degPCA(norm_matrix,metadata = metrics, # Using the metrics object as our metadatacondition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunkname ="sample", # Column from metadata to use to label the points in the PCA plotdata =TRUE# Include the data)pca$plot +# Create plotggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) +# Add title to plottheme(plot.title =element_text(hjust =0.5)) +# Center title on plotgeom_mark_ellipse(aes(color = .data[[column]])) +# Create ellipses on plotscale_color_grafify("kelly") # Set color parameter for plot```## Analysis of the variance by groupGroups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/)Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison.**This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate.** Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates.```{r PERMDISP}# Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix# Note, we need to transpose (use t())) the counts matrix for vegdist() to compute dissimilarity indices between the samplesvare.disa <-vegdist(t(assay(vsd_before)))# Evaluate the homogeneity of dispersion within the groupsmod <-betadisper(vare.disa, group = metrics[[factor_of_interest]])# Test the homogeniety of dispersion between groups with ANOVA.# ANOVA tests is there is overall significant variation in variance between groups.anova(mod)# Test the homogeniety of dispersion between groups with a permutation test# The permutation test considers pairwise comparisons so you can see which group(s) is differentpermutest(mod, pairwise =TRUE)```# Covariate analysisMultiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.**```{r run-covariates, fig.height = 6, fig.width = 10}# Visualize covariates for the PCsdegCovariates(counts = norm_matrix, # Normalized counts matrixmetadata = metrics # Metadata to overlay as covariates)```# Data modeling```{r setup-deseq2}# Enter your design formula# The format should be:# as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest))formula <-as.formula(paste0("~ ", " + ", column))# Stop if the column names in counts does not match and in the same order as the row names in metricsstopifnot(all(colnames(counts) ==rownames(coldata)))# Create the DDS object for DESeq2 and assign it to the object dds_to_usedds_to_use <-DESeqDataSetFromMatrix(countData = counts, # Provide the raw counts datacolData = coldata, # Provide the metadatadesign = formula # Provide the design formula)# Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before# Note: vsd_before and norm_matrix was previously assigned in the setup code chunk w/o variables in the formula so this is overwriting that# NOTE: VST won’t regress out this when normalizingvsd_before <-vst(dds_to_use)# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrixnorm_matrix <-assay(vsd_before)# We are going to modify coldata in the RUV section when is turn on, hence want to retain the originalnew_cdata <- coldata```For this study, this formula is recommended: `r as.character(formula)````{r note-ruv, eval=F, echo=FALSE}#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section```## Remove unwanted variationRemoving unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation.### Assessing unknown factors```{r note-ruv-print, results='asis'}# Print out text discussing is RUVseq will be done# If the RUV parameter in the params section of the YAML is TRUE, thenif (run_ruv) {cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")} else { # If the RUV parameter in the params section of the YAML is FALSE, thencat("There is no need to assess unknown factor for this study.")}``````{r do_RUV, eval=run_ruv, echo=run_ruv}# TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata# NOTE: RUVseq is used when you don’t know where the unwanted variation is coming from. The package utilizes dummy variable(s), 1-5 used, start with 1, look at PCA, decide if you want more separation. Add any known-created RUV variables to DESeq2 formula. The normalized matrix produced – only for visualization, not for input into DESeq2.library(RUVSeq)# If you want to skip the code, just set up formula to be your model in the next chunk of code# Vector containing the variable of interestdesign <- coldata[[column]]# Formats the variable of interest into a matrix# Levels are the rows and columns are the number of elements of a given level# The value is the index of the positions in the vectordiffs <-makeGroups(design)# Re-assigning the variance stabilized transformation count matrix to a object called datdat <- norm_matrix# By default RUVSeq is running one variable,ruvset_2 <-RUVs(x = dat, # Variance stabilized transformed counts datacIdx =rownames(dat), # Vector of the genesk =1, # Number of unknown covariates to findscIdx = diffs, # Matrix formatted variable of interestisLog =TRUE, # The variance stabilized transformation is approximately a log transformationround =FALSE# Whether to round the normalized measures to form pseudocounts)# W is the estimation of the unknown source of variation that we can now add to DESeq2vars <- ruvset$W# Create a new colData with W added to itnew_cdata <-cbind(coldata, vars)# New formula with Wformula <-as.formula(paste0("~ ",paste0(colnames(new_cdata)[grepl("W", colnames(new_cdata))],collapse =" + " ), " + ", column))# Overwrite the normalized count matrix with the normalized counts from RUVSeq# NOTE: Only use this for visualizationnorm_matrix <- ruvset$normalizedCounts# Create a PCApca2 <-degPCA(counts = norm_matrix, # Provide the adjusted counts from RUVSeqmetadata = new_cdata, # Provide metadatacondition = column # Provide the variable of interest to color the plot by) +ggtitle("After RUV") # Add a title# Plot PCApca2 +scale_color_grafify("kelly") # Apply color scale``````{r after_RUV, eval=run_ruv}# Make DDS objectdds_to_use <-DESeqDataSetFromMatrix(counts, new_cdata, design = formula)# Apply VSTvsd_to_use <-vst(dds_to_use, blind =FALSE)```### Remove Batch Effects```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq}# NOTE Combatseq (part of the SVA package) - corrected count, removes the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add known-removed covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation.library(sva)cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest.Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).")``````{r print-no-ruv-or-batch, eval=!run_combatseq, results='asis', echo=run_combatseq}cat("There is no need to remove known factors like batch effect in this study.")``````{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq}# NOTE: work on this code if you need to run ComBat-seq# Set your batch effect variable here this is the variable that ComBat-seq will try to remove## Column name of your batch variableto_remove <-"batch"## Column name of of your variable(s) of interestto_keep <-"sample_type"# Re-assign the column of coldata with the batch information as a factorcoldata[[to_remove]] <-as.factor(coldata[[to_remove]])# Re-assign the column of coldata with the variable of interest as a factorcoldata[[to_keep]] <-as.factor(coldata[[to_keep]])# Assign the column with batch information to the batch objectbatch <- coldata[[to_remove]]# Assign the column with variable of interest information to the treatment objecttreatment <- coldata[[to_keep]]# If you have multiple variables of interest you will need to assign each to a vector# treatment1 <- metrics[[to_keep]]# treatment2 <- metrics[[to_keep]]# treatment3 <- metrics[[to_keep]]# Column bind all of the vectors with columns containing variables of interest into one object# imp <- cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3)))``````{r do_combatseq, eval=run_combatseq}# Run ComBat-seqadjusted_counts <-ComBat_seq(counts =as.matrix(counts), # Provide the counts matrixbatch = batch, # Provide the batch informationgroup = treatment # Provide the variable of interest information)# NOTE: Running ComBat-seq with multiple variables of interest# adjusted_counts <- ComBat_seq(# counts = as.matrix(counts), # Provide the counts matrix# batch = batch, # Provide the batch information# covar_mod = imp # Provide the variables of interest information# )``````{r after_combatseq, eval=run_combatseq}# Re-enter the adjusted count from ComBat-seq into DESeq# NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq abovedds_to_use <-DESeqDataSetFromMatrix(countData = adjusted_counts, # Provide the adjusted counts from ComBat-seqcolData = coldata, # Provide coldatadesign = formula # Provide design formula, notably without the covariates used in ComBat-seq)# Perform a variance-stabilized transformation on the ComBat-seq adjusted count datavsd_combat <-vst(dds_to_use, blind =FALSE)# Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrixnorm_matrix <-assay(vsd_combat)# Run degPCA to use prcomp and make a plotpca_combat <-degPCA(counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seqmetadata = coldata, # Provide metadatacondition = column # Provide the variable of interest to color the plot by) +ggtitle("After Combatseq") # Add title# Plot PCA from dgePCApca_combat +scale_color_grafify("kelly") # Apply color scale```# Differential ExpressionDifferential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. ```{r run-deseq2}# Perform differential expression analysis on the DDS objectde <-DESeq(dds_to_use)# Plot the dispersion for the dataDESeq2::plotDispEsts(de)```Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions.The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.```{r lfc-shrink}# NOTE: We recommend LRT for time series# resultsNames(de) # Check the order is right# Create a list that mirrors the contrasts in the contrasts list object where the pattern is:# <column_of_comparison>_<treatment_name>_vs_<control_name>names_to_use <-lapply(contrasts, function(contrast) { coef <-paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])})# Currently the contrasts list object doesn't have names for the items in the list# Assign the names from names_to_use to the names in the contrasts list objectnames(contrasts) <- names_to_use# Create a list called `de_list` to hold the differential expression information for each contrast# Go through each contrast in the contrasts list object andde_list <-lapply(contrasts, function(contrast) {# Extract the results (LFC, pvalue, padj, etc.) for the contrast resLFC <-results(de, contrast = contrast)# Create a string for the contrast following this pattern:# <column_of_comparison>_<treatment_name>_vs_<control_name> coef <-paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])# Apply LFC shrinkage with `ashr`# NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less. resLFCS <-lfcShrink(dds = de, # Provide the DDS objectcontrast = contrast, # Provide the contrasttype ="ash"# Use `ashr` to apply the shrinkage )# If you want to use `apeglm` instead of `ashr` then use:# resLFCS <- lfcShrink(# dds = de, # Provide the DDS object# coef = coef, # Provide the contrast# type = "apeglm" # Use `apeglm` to apply the shrinkage# )# Create a new results data frame with some extra information res <-as.data.frame(resLFCS) %>%# Extract the results from the resLFCS as a dataframerownames_to_column("gene_id") %>%# Move the rownames to a column of data called "gene_id"left_join(rdata, by ="gene_id") %>%# Combine the results with the rdata object by the gene_id and rdata holds information about each generelocate(gene_name) %>%# Move the "gene_name" column to be the first column dplyr::rename(lfc = log2FoldChange) %>%# Rename the column log2FoldChange to lfcmutate(pi =abs(lfc) *-log10(padj)) %>%# Create a new column named pi that multiplies the absolute value of log fold change by the log10 adjusted adjusted p-valuearrange(-pi) # Arrange the data frame by the highest values of pi# Filter out genes that have no expression or were filtered out by DESEQ2 res <- res[res$baseMean >0, ] %>%# Remove genes with no expressiondrop_na(padj) %>%# Remove genes who don't have an adjusted p-value (lots of lowly expressed genes)drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes)# Extract the significant genes res_sig <- res %>%filter(padj <0.05) %>%# Retain the values that have an adjusted p-value less than 0.05arrange(padj) %>%# Arrange the genes by the lowest adjusted p-valuemutate(gene_name =ifelse(test = gene_name ==""|is.na(gene_name), # If the gene_name value for a gene is NA or blank thenyes = gene_id, # Use the gene_id as the gene_nameno = gene_name # Maintain the gene_name ))# Create a list object called results results <-list(lfc = resLFC, # Assign the unshrunken results data to `lfc`lfcs = resLFCS, # Assign the shrunken results data to `lfcs`all = res, # Assign the full differential expression results to `all`sig = res_sig # Assign the significantly differentially expressed genes to `sig` )# Return the `results` list object to be a list within the object `de_list`return(results)})# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:# de_list <- c(de_list, new_comparison=list(lfc=resLFC,# lfcs=resLFCS,# all=res, sig=res_sig))```## MA plotThis plot can help to:- Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions.- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions.- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset.::: {.panel-tabset}```{r after_lfc_shrink, message=F, warning=F}#| results: asis#| fig-width: 4#| fig-height: 4# For each of the contrasts in the de_list list object to create MA plotsfor (contrast innames(de_list)) {# Create a tabset of the contrastcat("### ", contrast, "\n\n")# Use the unshrunken (lfc) counts for creating an MA plot p1 <-degMA(as.DEGSet(de_list[[contrast]]$lfc)) +ggtitle("Before LFC Shrinking") # Add title# Print the unshrunken counts MA plotprint(p1)# Use the shrunken (lfc) counts for creating an MA plot p2 <-degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit =2) +ggtitle("After LFC Shrinking") # Add titleprint(p2) # Print the shrunken counts MA plotcat("\n\n") # Print two new lines}```:::## Differentially Expressed Genes```{r}#| results: asisdt_list <-list()for (contrast innames(de_list)) { res_sig <- de_list[[contrast]][["sig"]] dt_list <-c( dt_list,list(h3(contrast)),list(DT::datatable(res_sig,options =list(scrollX =TRUE, autoWidth =TRUE ),class ="stripe hover" )) )}tagList(dt_list)```## Volcano plotThis volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and model coefficient > ±0.5). Points in pink are significant with a small effect (padj < 0.05 and model coefficient < ±0.5) and points in blue are not significant but have a large effect (padj > 0.05 and model coefficient > ±0.5). Grey points are non-significant. The dashed lines correspond to the cutoff values of padj and model coefficient that we have chosen.::: {.panel-tabset}```{r}#| results: asis#| fig-height: 6for (contrast innames(de_list)) {cat("### ", contrast, "\n\n") res <- de_list[[contrast]][["all"]] res_mod <- res show <-as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) p1 <-EnhancedVolcano(res_mod,lab = res_mod$gene_name,pCutoff =0.05,selectLab =c(show$gene_name),FCcutoff =0.5,x ="lfc",y ="padj",title = contrast,col =c("darkgrey", "lightblue", "plum1", "purple"),subtitle ="",drawConnectors =TRUE,max.overlaps =Inf )print(p1)cat("\n\n")}```:::## Heatmap::: {.panel-tabset}```{r}#| results: asisfor (contrast innames(de_list)) {cat("### ", contrast, "\n\n") res_sig <- de_list[[contrast]][["sig"]] ma <- norm_matrix[res_sig$gene_id, ] colma <- coldata[, c(column), drop =FALSE] ma_colors <-lapply(colnames(colma), function(c) { l.col <- colors[1:length(unique(colma[[c]]))]names(l.col) <-sort(unique(colma[[c]])) l.col })names(ma_colors) <-colnames(colma) p1 <-pheatmap(ma,color =inferno(10),cluster_rows =TRUE,show_rownames =FALSE,annotation = colma,annotation_colors = ma_colors,border_color =NA,fontsize =10,scale ="row",fontsize_row =10,height =20 )print(p1)cat("\n\n")}```:::## Plot Top 16 Genes::: {.panel-tabset}```{r}#| results: asis#| fig-height: 6#| fig-width: 8n <-16for (contrast innames(de_list)) {cat("### ", contrast, "\n\n") res_sig <- de_list[[contrast]][["sig"]] top_n <- res_sig %>%slice_min(order_by = padj, n = n, with_ties =FALSE) %>% dplyr::select(gene_name, gene_id) top_n_exp <- norm_matrix %>%as.data.frame() %>%rownames_to_column("gene_id") %>%pivot_longer(cols =!gene_id, names_to ="sample", values_to ="log2_expression") %>%right_join(y = top_n, relationship ="many-to-many") %>%left_join(y = coldata, by ="sample") p1 <-ggplot(top_n_exp, aes_string(x = column, y ="log2_expression")) +geom_boxplot(outlier.shape =NA, linewidth =0.5, color ="grey") +geom_point() +facet_wrap(~gene_name) +ggtitle(str_interp("Expression of Top ${n} DEGs")) +theme(axis.text.x =element_text(angle =90, vjust =0.5, hjust =1))print(p1)cat("\n\n")}```:::# Pathway EnrichmentFrom the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. ```{r setup-fa}# Call the appropriate functional analysis databases for your analysisif (grepl("hg|human", genome, ignore.case =TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then all_in_life <-get_databases_v2(sps ="human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <-TRUE# Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run} elseif (grepl("mm|mouse", genome, ignore.case =TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then all_in_life <-get_databases_v2(sps ="mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory run_FA <-TRUE# Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run} else { # If your organism isn't mouse or human thenwarning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message run_FA <-FALSE# Assign the value of FALSE to a object called run_FA and it will not allow the functional analysis sections to run}```# Pathway Analysis- GSEAGene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA.- Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes.- Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge.- Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes.**In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.**```{r run-gsea, warning=F, message=F, eval=run_FA}# Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to to different elements of the listfa_gsea_list <-lapply(de_list, function(contrast) {# For a given contrast pull out the full differential expression results res <- contrast[["all"]]# Creating the input for the GSEA analysis gsea_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj)) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc"# Obtain the ENSEMBL ID and gene symbols for the genes in the gsea_input object# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- AnnotationDbi::select(x = ann.org, # Annotation database to searchkeys = gsea_input$gene_id, # The gene_ids to search forkeytype ="ENSEMBL", # The gene_ids (keys) are ENSEMBL annotationscolumns =c("ENTREZID", "SYMBOL") ) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol# Combine gene annotation with the lfc information input_entrezid <-inner_join(x = gsea_input, # Combine the ENSEMBL ID and lfc input withy = input_entrezid, # The ENTREZ ID and gene symbol informationby =c("gene_id"="ENSEMBL") ) %>%# Join these table together by common ENSEMBL IDsfilter(!is.na(ENTREZID)) %>%# Remove the genes without an ENTREZ IDdistinct(ENTREZID, .keep_all =TRUE) %>%# Retain the unique ENTREZ IDs and all columns of dataarrange(desc(lfc)) # Arrange the data frame by descending values of lfc# Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb tb <-run_fgsea_v2(input = input_entrezid, # Query for the GSEA analysisall_in_life = all_in_life ) # Annotation databases to query against tb %>%filter(padj <0.05) %>%arrange(padj) # Filter and sort the GSEA output for results with an adjust p-value less than 0.05 and return them})``````{r print-gsea, results='asis', eval=run_FA}# NOTE: DT::datatables doesn't work with tabset and for loops# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Assign the significantly enriched pathways to the object res_sig res_sig <- fa_gsea_list[[contrast]] %>%# Use the GSEA obect for a given contrast. NOTE: These have already been filter for adjusted p-value less than 0.05.filter(padj <0.05) %>%arrange(padj) # Filter and sort for adjusted p-values of less than 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways )}# Render the dt_list into HTMLtagList(dt_list)```# Pathway Analysis- Over-representationOver-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA:- Simplicity: Easy to perform and interpret.- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.```{r run-ora, warning=F, message=F, eval=run_FA}# Go through each contrast in the de_list list object andfa_list <-lapply(de_list, function(contrast) {# For a given contrast pull out the full differential expression results res <- contrast[["all"]]# Extract the universe of genes to the ORA on and assign it to the object universe universe <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj)) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list objectpull(gene_id) # Extract the gene_id column# Subset the full annotations to only contain universe we will evaluate within universe_mapping <- rdata %>%# Start with the full annotationsfilter(gene_id %in% universe, !is.na(entrez)) # Only retain the the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID# AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL'))# Create the input vector of genes for the ORA with a positive or negative lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, abs(lfc) >0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the absolute value of the lfc is greater than 0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (ajusted p-value less than 0.01, lfc greater than the absolute value of 0.3) and have an ENTREZ ID# AnnotationDbi::select(ann.org, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory all <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create the input vector of genes for the ORA with a positive lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, lfc >0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is greater than 0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc greater than 0.3) and have an ENTREZ ID# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory up <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create the input vector of genes for the ORA with a negative lfc ora_input <- res %>%# Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)filter(!is.na(padj), padj <0.01, lfc <-0.3) %>%# Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the lfc is less than -0.3pull(gene_id) # Extract the gene ID# Create the ENTREZ ID input for the ORA# NOTE: Change to the correct species if not working in human or mouse input_entrezid <- rdata %>%# Start with the full annotationsfilter(gene_id %in% ora_input, !is.na(entrez)) # Only retain the the annotations that are are in the ora_input object (adjusted p-value less than 0.01 and lfc less than -0.3) and have an ENTREZ ID# Run the ORA using the run_fora_v2 function within the FA.R file in the 00_libs directory down <-run_fora_v2(input = input_entrezid, # Data frame of genes that are being evaluated for enrichmentuni = universe_mapping, # Universe of gene to look for enrichment withinall_in_life = all_in_life ) # Annotation databases to query against# Create a list to hold the ORA results for a given contrastlist(all = all, # Assign the positive and negative lfc combined ORA results to "all"up = up, # Assign the positive lfc combined ORA results to "up"down = down # Assign the negative lfc combined ORA results to "down" )})```## All significant genes```{r}#| results: asis#| eval: !expr run_FA# NOTE DT::datatables doesn't work with tabset and for loops# You can use the following code to print dynamically or call manually sanitize_datatable() multiple timesdt_list <-list()for (contrast innames(de_list)) { res_sig <- fa_list[[contrast]][["all"]] %>%filter(padj <0.05) %>%arrange(padj) # Filter and sort dt_list <-c( dt_list,list(h3(contrast)),list(sanitize_datatable(res_sig)) )}tagList(dt_list)```## Down-regulated genes ```{r print-ora-down}#| results: asis#| eval: !expr run_FA# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Create a dataframe to hold the significantly over-represented terms for genes with a negative lfc res_sig <- fa_list[[contrast]][["down"]] %>%# Start with the complete list of terms used in the ORA evaluated for negative lfcfilter(padj <0.05) %>%arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a negative lfc )}# Render the dt_list into HTML for the significantly over-represented terms for genes with a negative lfctagList(dt_list)```## Up-regulated genes```{r print-ora-up}#| results: asis#| eval: !expr run_FA# Create an empty list to assign contrast values and DT data tables todt_list <-list()# For each of the contrasts in the de_list list object to extract the name of the contrast and the ORA results to be rendered into an HTML tablefor (contrast innames(de_list)) {# Create a dataframe to hold the significantly over-represented terms for genes with a positive lfc res_sig <- fa_list[[contrast]][["up"]] %>%# Start with the complete list of terms used in the ORA evaluated for positive lfcfilter(padj <0.05) %>%arrange(padj) # Filter and sort only the results with adjusted p-values of 0.05# Populate the dt_list list dt_list <-c( dt_list, # With the contents of dt_list from the previous looplist(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 sizelist(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly over-represented terms for genes with a positive lfc )}# Render the dt_list into HTML for the significantly over-represented terms for genes with a positive lfctagList(dt_list)```# Save files```{r write-files}# If the subset value from the params in the YAML is not null thenif (!is.null(subset_value) &!is.null(subset_column)) {# Interpret the value of subset_value and assign it to the object called filenames filenames <-str_interp("${subset_value}")} else { # If the subset value from the params in the YAML is null then# Assign the calue of "full" to the object called filenames filenames <-"full"}# Assign the value of filenames to a new object called filenamefilename <-paste0(filenames)# Assign the file path to save the file path for the expression data CSV to the object name_expression_fnname_expression_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_expression.csv") # Interpret the value of "filename" and insert that in front of "_expression.csv")# Extract the variance stabilized transformed counts matrix into a data frame to be savedcounts_norm <- norm_matrix %>%# Start with the variance stabilized transformed counts matrixas.data.frame() %>%# Convert the matrix to a data frame# Convert the row names (ENSEMBL IDs) to a column called "gene_id"rownames_to_column("gene_id") %>%left_join(rdata, by ="gene_id")# Write the variance stabilized transformed counts matrix to a CSV file with the name_expression_fn pathwrite_csv(counts_norm, name_expression_fn)# For each of the contrasts in the de_list list object to save the differential expression analysis and ORA to their respective CSV filesfor (contrast innames(contrasts)) {# Create a file name for the differential expression analysis name_deg_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_${contrast}_deg.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_deg.csv" )# Create a file name for the ORA analysis name_pathways_fn <-file.path( # Construct a file path basedir, # Base directory path to write output files to specificied in the params.R file within the 00_params directorystr_interp("${filename}_${contrast}_pathways.csv") # Interpret the value of "filename" and "contrast", then insert that in front of "_pathways.csv" )# Assign the full differential expression analysis for a given contrast to the object called res_for_writing res_for_writing <- de_list[[contrast]][["all"]] %>%# Start with the full differential expression analysis dataframe for the given contrastmutate(comparison = contrast) # Add a column to the table with the given contrast# Assign the full ORA for positive and negative lfc for a given contrast to the object called pathways_for_writing# NOTE: Choose what ORA to save (all, down or up). To save everything, add more lines of this code pathways_for_writing <- fa_list[[contrast]][["all"]] %>%# Start with the full ORA dataframe evaluating negative and positive lfc for the given contrastmutate(comparison = contrast) # Add a column to the table with the given contrast# Write the full differential expression analysis for a given contrast to a CSV file with the name_deg_fn pathwrite_csv(res_for_writing, name_deg_fn)# Write the full ORA for a given contrast to a CSV file with the name_pathways_fn pathwrite_csv(pathways_for_writing, name_pathways_fn)}```If you would like to use `DEGpatterns.Rmd` template to visualize trends of gene changes across conditions/groups, please save the following output:```{r save}# Save the DESeq object to an RDS filesaveRDS(dds_to_use, "DEGpattern_deseq_obj.rds")# Save the metadata to an RDS filesaveRDS(coldata, "deseq_coldata.rds")# Set the adjusted p-value threshold to 0.05padj.cutoff <-0.05# Assign a value for the top n genes to XXXtopN <-1000# Go through each contrast in the de_list list object and extract the significance values for the significant genes for top N significant genes# This will go through the de_list object, assign the name of the contrast in the de_list to contrast and assign its associated differential expression list (containing lfc, lfcs, all and sig) to the object contrast_dedeg_List <-imap(de_list, \(contrast_de, contrast){# Extract the significant genes from a given contrast and assign them to res_LRT res_LRT <- contrast_de[["sig"]]# Create a named vector where the values in the vector are the adjusted p-values and the names are the index of the value gene_padj <-setNames(res_LRT$padj, rownames(res_LRT))# Subset the gene_padj vector by the adjusted p-value threshold established in the top of this code block gene_sig <- gene_padj[gene_padj < padj.cutoff]# Arrange the gene significance values with the most significant first (they should already be arranged this way though), then keep either the first N genes defined by topN at the top of this code block or the total number of significant genes, whichever is fewer and assign it to the object called degs degs <-sort(gene_sig)[1:min(length(gene_sig), topN)]# Save the RDS object for the significance values for a given contrast, adjusted p-value threshold and number of top N genessaveRDS(degs, glue("DEGpattern_deseq_{contrast}_DEGs_padj{padj.cutoff}_topN{topN}.rds"))# Populate the deg_List with the degs vector for each contrastreturn(degs)})# Please note, you can choose to combine all significant results across contrast as well```# Conclusion# Methods## R package references```{r citations, results='asis'}# Provide citationscitation("DESeq2")citation("ggplot2")citation("pheatmap")```## R sessionList and version of tools used for the DE report generation.```{r}# Print sessionInfo()sessionInfo()```