co-expression with imputation

Author

Harvard Chan Bioinformatics Core

Published

August 29, 2025

Created output directory: ./test_out 
Code
# Filtration parameters
filter_gene <- FALSE
min_exp <- 0.2
min_cells <- 40
min_perc <- 0.2

This report intends to both calculate correlations of gene of interests and also compares how imputation changes those values.

Throughout this report, we assess Spearman correlation estimates.

We further assessed two ways of imputating gene expression values: MAGIC and SAVER.

More details about the method can be found in the section Method description.

Overview of project

  • Path seurat: https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds
  • Metadata column with cell group label: integrated_snn_res.0.4
  • Filter genes based on expression and frequency: FALSE

List of all gene of interests:

Code
corr_genes_all <- c(
  "COL1A1", "COL1A2", "DCN",
  "EBF2", "IL33", "PRX", "LY6G6C"
)
DT::datatable(data.frame(gene = corr_genes_all))

Starting off with 7 genes of interest.

Load seurat object

Code
seurat <- inputRead(seurat_obj)
Idents(seurat) <- group_column
seurat[["SCT"]] <- NULL
raw_rna <- GetAssayData(object = seurat[["RNA"]], layer = "counts")
genes.use <- rowSums(raw_rna) > 0
genes.use <- names(genes.use[genes.use])
seurat <- seurat[genes.use, ]
n_cells <- ncol(seurat)

From the genes that were supplied, removing any that are not expressed in this dataset.

Code
corr_genes <- corr_genes_all[corr_genes_all %in% genes.use]

Next we look at the basic distribution of remaining genes of interest in terms of expression and number of cells they are expressed in.

These are the following filtration metrics that are set:

  • Average expression < 0.2
  • Number of cells expressed in > 40
  • Percentage of cells expressed in > 0.2

Filtration parameter was set to FALSE. If FALSE, no further filtration will be done at this step.

Code
data_rna <- FetchData(seurat[["RNA"]], vars = corr_genes)

# Number of cells a gene is expressed in
num_cells <- colSums(data_rna > 0)
# Percentage of cells a gene is expressed in
perc_cells <- num_cells / ncol(seurat)
# Average expression of a gene
avg_expression <- colMeans(data_rna)

df_genes <- data.frame(num_cells, perc_cells, avg_expression)
df_genes <- df_genes %>%
  mutate(filter = !((perc_cells > min_perc) &
    (num_cells > min_cells) &
    (avg_expression > min_exp)))
df_genes$gene <- row.names(df_genes)
df_genes <- df_genes %>% arrange(desc(avg_expression), desc(perc_cells))
df_genes %>%
  select(-gene) %>%
  magrittr::set_colnames(c(
    "N_Cell_Expressed", "Perc_Cell_Expressed",
    "Avg_Expression", "Filter"
  )) %>%
  DT::datatable() %>%
  DT::formatRound(columns = c("Perc_Cell_Expressed", "Avg_Expression"), digits = 3)
Code
df_genes %>%
  ggplot(aes(x = perc_cells, y = avg_expression, color = filter)) +
  geom_point(size = rel(3)) +
  scale_color_manual(values = c("black", "darkred")) +
  theme_minimal() +
  labs(x = "Percent of cells expressing the gene", y = "Average expression")

Code
if (filter_gene == TRUE) {
  corr_genes <- (df_genes %>% subset(filter == FALSE))$gene
}

7 genes of interest remaining.

Imputation and normalization

We compare three alternative methods of estimating expression levels to log normalization and assess their ability to account for dropout.

  1. SCTransform (raw counts -> normalized counts)
  2. MAGIC (raw counts -> imputed, normalized counts)
  3. SAVER (raw counts -> imputed, normalized counts)
Code
# Store output so we don't have to re-run imputation each time
filename <- glue("{path_outs}/imputed.RDS")
if (!file.exists(filename)) {
  # Get raw counts (genes x cells matrix)
  raw_rna <- as.matrix(GetAssayData(seurat[["RNA"]], slot = "counts"))

  # SCT
  # Re-run SCT on subset data
  seurat <- SCTransform(seurat, return.only.var.genes = FALSE, min_cells = 1)

  # Creating new seurat object for genes of interest only
  # Use GetAssayData to reliably extract counts/data (features x cells)
  data_raw <- as.matrix(GetAssayData(seurat[["RNA"]], slot = "counts")[corr_genes, , drop = FALSE])
  data_rna <- as.matrix(GetAssayData(seurat[["RNA"]], slot = "data")[corr_genes, , drop = FALSE])
  data_sct <- as.matrix(GetAssayData(seurat[["SCT"]], slot = "data")[corr_genes, , drop = FALSE])

  seurat_imputed <- CreateSeuratObject(
    counts = data_raw,
    data = data_rna,
    meta.data = seurat@meta.data
  )
  seurat_imputed[["SCT"]] <- CreateAssayObject(data = data_sct)
  seurat_imputed[["RAW"]] <- CreateAssayObject(counts = raw_rna)

  # Delete the original seurat object to save memory
  rm(seurat)

  data_magic <- magic(t(raw_rna), genes = corr_genes)$result
  # magic may return a data.frame/tibble or matrix (cells x genes). Force to matrix and transpose to features x cells
  if (!is.matrix(data_magic)) data_magic <- as.matrix(data_magic)
  data_magic_t <- t(data_magic)
  # ensure dimnames align: rows = genes, cols = cells
  if (is.null(rownames(data_magic_t))) rownames(data_magic_t) <- corr_genes
  if (is.null(colnames(data_magic_t))) colnames(data_magic_t) <- colnames(raw_rna)
  seurat_imputed[["MAGIC"]] <- CreateAssayObject(data = data_magic_t)

  # SAVER
  # Generate SAVER predictions for those genes
  genes.ind <- which(rownames(raw_rna) %in% corr_genes)
  saverD <- raw_rna
  attr(saverD, "class") <- "matrix"
  data_saver <- saver(saverD,
    pred.genes = genes.ind,
    pred.genes.only = TRUE,
    estimates.only = TRUE,
    ncores = parallel::detectCores() - 2
  )

  seurat_imputed[["SAVER"]] <- CreateAssayObject(data = data_saver)

  saveRDS(seurat_imputed, filename)
}
Calculating MAGIC...
  Running MAGIC on 1890 cells and 18854 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 2.51 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.36 seconds.
    Calculating affinities...
    Calculated affinities in 0.35 seconds.
  Calculated graph and diffusion operator in 3.21 seconds.
  Calculating imputation...
Calculated MAGIC in 3.23 seconds.
Code
seurat_imputed <- readRDS(filename)

Average expression for each method

Code
assays <- c("RNA", "SCT", "MAGIC", "SAVER")

df_avg <- data.frame(gene = corr_genes)
for (assay in assays) {
  data <- GetAssayData(object = seurat_imputed[[assay]], layer = "data")
  avg <- data.frame(rowMeans(data))
  colnames(avg) <- assay
  avg$gene <- row.names(avg)
  df_avg <- left_join(df_avg, avg, by = "gene")
}

pheatmap(df_avg %>% column_to_rownames(var = "gene"),
  scale = "column",
  cluster_col = TRUE, cluster_row = TRUE,
  show_rownames = TRUE,
  angle_col = 0, color = inferno(10)
)

Correlation Estimates

We have a few different ways to compute correlation scores with their associated p-values:

  1. Spearman correlation
  • SCTransform counts -> spearman correlation matrix
  • MAGIC imputed -> spearman correlation matrix
  • SAVER imputed -> spearman correlation matrix
  1. (removed) CS-CORE (this report no longer runs CS-CORE)
Code
# Store output so we don't have to re-run correlation each time
filename <- glue("{path_outs}/corr.csv")

if (!file.exists(filename)) {
  # Compute spearman correlation for each method (except CS-CORE which is run later)
  # Unique combination of each gene pair
  genes_comb <- data.frame(t(combn(corr_genes, 2)))
  n_comb <- nrow(genes_comb)

  # Create dataframe with correlation and p-values scores
  df_corr <- genes_comb %>% rename("Var1" = X1, "Var2" = X2)
  df_corr[assays] <- NA
  df_p_val <- df_corr

  for (idx in 1:n_comb) {
    if (idx %% 200 == 0) {
      print(glue("{idx}/{n_comb} correlations computed."))
    }

    # Name of genes to run correlation on
    gene_1 <- genes_comb[idx, 1]
    gene_2 <- genes_comb[idx, 2]

    for (assay_ in assays) {
      # extract assay data safely (features x cells) and subset to the two genes
      assay_mat <- tryCatch(as.matrix(GetAssayData(seurat_imputed[[assay_]], slot = "data")), error = function(e) NULL)
      if (is.null(assay_mat)) {
        gene_exp <- data.frame()
      } else {
        sub_mat <- assay_mat[c(gene_1, gene_2), , drop = FALSE]
        # transpose to cells x genes for cor.test
        gene_exp <- as.data.frame(t(sub_mat))
      }

      if (all(gene_exp[[gene_1]] == 0) | all(gene_exp[[gene_2]] == 0)) {
        corr_val <- 0.0
        p_val <- 1.0
      } else {
        # Calculate spearman correlation and p-value otherwise
        tmp <- cor.test(gene_exp[[gene_1]], gene_exp[[gene_2]],
          method = "spearman", exact = FALSE
        )
        corr_val <- as.numeric(unname(tmp$estimate))
        p_val <- as.numeric(tmp$p.value)
      }

      # Store correlation and p-values
      df_corr[idx, assay_] <- corr_val
      df_p_val[idx, assay_] <- p_val
    }
  }

  # CS-CORE removed: no additional co-expression estimates are appended here

  # Save output
  write.csv(df_corr, filename)
  write.csv(df_p_val, glue("{path_outs}/p_corr.csv"))
}

df_corr <- read.csv(filename, row.names = 1)
df_p_val <- read.csv(glue("{path_outs}/p_corr.csv"), row.names = 1)

Heatmap of correlation estimates

Showing the patterns of correlation for each method. The x-axis and y-axis are the genes of interest with the corresponding correlation value for the pair as the value. Keep in mind that this is symmetric matrix.

Code
methods <- c("RNA", "SCT", "MAGIC", "SAVER")

cor_List <- purrr::map(methods, \(method){
  corr <- df_corr[c("Var1", "Var2", method)]
  corr_cp <- corr %>% rename(Var1 = Var2, Var2 = Var1)
  corr <- rbind(corr, corr_cp)
  mtx <- reshape2::dcast(corr, Var2 ~ Var1) %>% column_to_rownames("Var2")

  # Set the diagonal values: Correlation = 1, p-value = 1
  mtx <- as.matrix(mtx)
  diag(mtx) <- 1

  breaks <- seq(-1, 1, by = 0.1)
  show_legend <- F
  p <- pheatmap(mtx,
    color = inferno(10),
    show_rownames = FALSE,
    show_colnames = FALSE,
    breaks = breaks,
    main = method,
    silent = T,
    legend = show_legend,
    fontsize = rel(8)
  )
  return(p[[4]])
})

cor_comb <- grid.arrange(grobs = cor_List, nrow = 2, ncol = 3)

Code
plot(cor_comb)

Compare correlation estimates across methods

Comparing the correlation scores for each gene pair for MAGIC and SAVER.

In these scatterplots, the gene-pairs that are colored red have different results for significance.

Code
methods <- c("MAGIC", "SAVER")
methods_comb <- data.frame(t(combn(methods, 2)))
plot_list <- list()

for (idx in 1:nrow(methods_comb)) {
  method_1 <- methods_comb[idx, 2]
  method_2 <- methods_comb[idx, 1]

  corr <- df_corr[c("Var1", "Var2", method_1, method_2)]
  p_val <- df_p_val[, c("Var1", "Var2", method_1, method_2)]
  corr$sig_1 <- p_val[[method_1]]
  corr$sig_2 <- p_val[[method_2]]

  corr <- corr %>% mutate(sig = (sig_1 < 0.5) & (sig_2 < 0.05))

  p <- ggplot(corr) +
    geom_point(aes(
      x = get(method_1), y = get(method_2),
      color = sig
    ), size = rel(3)) +
    theme_bw() +
    NoLegend() +
    scale_color_manual(values = c("FALSE" = "#9D0208", "TRUE" = "black")) +
    labs(x = method_1, y = method_2, title = paste(method_1, "vs", method_2)) +
    theme(plot.title = element_text(size = rel(1.5))) +
    ylim(-1, 1) +
    xlim(-1, 1) +
    geom_abline(slope = 1, intercept = 0, color = "#63bff0")

  plot_list[[idx]] <- ggplotGrob(p)
}

grid.arrange(grobs = plot_list, nrow = 3)

Method description

MAGIC is an algorithm for denoising high-dimensional data most commonly applied to single-cell RNA sequencing data. MAGIC learns the manifold data, using the resultant graph to smooth the features and restore the structure of the data.

SAVER implements a regularized regression prediction and empirical Bayes method to recover the true gene expression profile in noisy and sparse single-cell RNA-seq data.

R session

List and version of tools used for the QC report generation.

Code
sessionInfo()
R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] Rmagic_2.0.3.999   Matrix_1.7-3       SAVER_1.1.2        viridis_0.6.5     
 [5] viridisLite_0.4.2  RColorBrewer_1.1-3 gridExtra_2.3      pheatmap_1.0.13   
 [9] Seurat_5.3.0       SeuratObject_5.1.0 sp_2.2-0           glue_1.8.0        
[13] knitr_1.50         lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1     
[17] dplyr_1.1.4        purrr_1.1.0        readr_2.1.5        tidyr_1.3.1       
[21] tibble_3.3.0       ggplot2_3.5.2      tidyverse_2.0.0   

loaded via a namespace (and not attached):
  [1] jsonlite_2.0.0         magrittr_2.0.3         spatstat.utils_3.1-4  
  [4] farver_2.1.2           rmarkdown_2.29         vctrs_0.6.5           
  [7] ROCR_1.0-11            spatstat.explore_3.4-3 htmltools_0.5.8.1     
 [10] sass_0.4.10            sctransform_0.4.2      parallelly_1.45.0     
 [13] KernSmooth_2.23-26     bslib_0.9.0            htmlwidgets_1.6.4     
 [16] ica_1.0-3              plyr_1.8.9             cachem_1.1.0          
 [19] plotly_4.11.0          zoo_1.8-14             igraph_2.1.4          
 [22] mime_0.13              lifecycle_1.0.4        iterators_1.0.14      
 [25] pkgconfig_2.0.3        R6_2.6.1               fastmap_1.2.0         
 [28] fitdistrplus_1.2-4     future_1.58.0          shiny_1.10.0          
 [31] digest_0.6.37          patchwork_1.3.1        rprojroot_2.1.0       
 [34] tensor_1.5.1           RSpectra_0.16-2        irlba_2.3.5.1         
 [37] crosstalk_1.2.1        labeling_0.4.3         progressr_0.15.1      
 [40] spatstat.sparse_3.1-0  timechange_0.3.0       httr_1.4.7            
 [43] polyclip_1.10-7        abind_1.4-8            compiler_4.5.1        
 [46] here_1.0.1             doParallel_1.0.17      withr_3.0.2           
 [49] fastDummies_1.7.5      R.utils_2.13.0         MASS_7.3-65           
 [52] rappdirs_0.3.3         tools_4.5.1            lmtest_0.9-40         
 [55] httpuv_1.6.16          future.apply_1.20.0    goftest_1.2-3         
 [58] R.oo_1.27.1            nlme_3.1-168           promises_1.3.2        
 [61] grid_4.5.1             Rtsne_0.17             cluster_2.1.8.1       
 [64] reshape2_1.4.4         generics_0.1.4         gtable_0.3.6          
 [67] spatstat.data_3.1-6    tzdb_0.5.0             R.methodsS3_1.8.2     
 [70] data.table_1.17.8      hms_1.1.3              spatstat.geom_3.4-1   
 [73] RcppAnnoy_0.0.22       ggrepel_0.9.6          RANN_2.6.2            
 [76] foreach_1.5.2          pillar_1.10.2          spam_2.11-1           
 [79] RcppHNSW_0.6.0         later_1.4.2            splines_4.5.1         
 [82] lattice_0.22-6         survival_3.8-3         deldir_2.0-4          
 [85] tidyselect_1.2.1       miniUI_0.1.2           pbapply_1.7-2         
 [88] scattermore_1.2        xfun_0.52              matrixStats_1.5.0     
 [91] DT_0.33                stringi_1.8.7          lazyeval_0.2.2        
 [94] yaml_2.3.10            evaluate_1.0.3         codetools_0.2-20      
 [97] cli_3.6.5              uwot_0.2.3             xtable_1.8-4          
[100] reticulate_1.42.0      jquerylib_0.1.4        Rcpp_1.1.0            
[103] globals_0.18.0         spatstat.random_3.4-1  png_0.1-8             
[106] spatstat.univar_3.1-4  parallel_4.5.1         dotCall64_1.2         
[109] listenv_0.9.1          scales_1.4.0           ggridges_0.5.6        
[112] rlang_1.1.6            cowplot_1.1.3