Benchmarking DE Analysis with scDesign3

Dongyuan Song

Bioinformatics IDP, University of California, Los Angeles
dongyuansong@ucla.edu

Qingyang Wang

Department of Statistics, University of California, Los Angeles
qw802@g.ucla.edu

15 September 2023

Source: ../../scDesign3/code/vignettes/scDesign3-DEanalysis-vignette.Rmd

scDesign3-DEanalysis-vignette.Rmd

library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
library(DuoClustering2018)
library(Seurat)
library(SeuratObject)
library(scran)
library(parallel)
library(monocle3)
library(DESeq2)
library(BiocParallel)
library(NBAMSeq)
library(PseudotimeDE)
library(dplyr)
library(tradeSeq)
library(reshape2)
library(spatialDE)
library(SPARK)
library(scales)
library(tidyr)
theme_set(theme_bw())

In this tutorial, we will demonstrate how to use scDesign3 to generate negative control and benchmark methods for identifying differentially expressed (DE) genes between discrete cell types or continuous trajectory from scRNA-seq data, and identifying spatially variable genes (SVG) from spatial transcriptomics data. Please note that here we only did a very brief benchmarking on a few methods for illustration purpose, not for formal comparison.

Identification of DE genes between discrete cell types

Read in the reference data

The raw data is from the R package DuoClustering2018 which contain a set of datasets with various clustering results.

Zhengmix4eq_sce <- get("sce_filteredExpr10_Zhengmix4eq")(metadata = FALSE)

The top 200 highly variable genes are kept for generating synthetic data.

ngene <- 200
logcounts(Zhengmix4eq_sce) <- log1p(counts(Zhengmix4eq_sce))
zheng_sce <- modelGeneVar(Zhengmix4eq_sce)
chosen <- getTopHVGs(zheng_sce, n = ngene)
example_sce <- Zhengmix4eq_sce[chosen,]

We extract out B cells and regulatory T cells only and use all cells from these two cell types to simulate synthetic data.

selected_cells <- which(colData(example_sce)$phenoid %in% c("b.cells","regulatory.t"))
example_sce <- example_sce[,selected_cells]
colData(example_sce)$cell_type <- as.factor(colData(example_sce)$phenoid)
example_sce
head(colData(example_sce))

Simulation

We use the step-by-step functions instead of the one-shot function to generate synthetic data since these step-by-step functions allow us to alter estimated parameters and generate new data based on our desired parameters.

set.seed(123)
example_data <- construct_data(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
  )

example_marginal <- fit_marginal(
    data = example_data,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "1",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE
  )

set.seed(123)
example_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = example_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = example_data$dat
  )

example_para <- extract_para(
    sce = example_sce,
    marginal_list = example_marginal,
    n_cores = 2,
    family_use = "nb",
    new_covariate = example_data$newCovariate,
    data = example_data$dat
  )

Here, we examine the mean_mat, which is one of the outputs from the previous function extract_para(). For each gene, we calculate the difference in the between the maximum mean parameter and minimum mean parameter across all cells. We select genes which the gene’s mean difference across cells are in the top 50 largest differences. We regard these genes as DE genes. Then, we manually set the mean parameters of the rest genes to be the same across all cells. We regard all genes with the same mean parameter across cells as non-DE genes. Of course, this is a very flexible step and users may choose other ideas to modify the mean matrix.

diff <- apply(example_para$mean_mat, 2, function(x){max(x,na.rm = TRUE)-min(x,na.rm = TRUE)})
diff_ordered <- order(diff, decreasing = TRUE)
diff <- diff[diff_ordered]
num_de <- 50
de_idx <- names(diff[1:num_de])
non_de_idx <- names(diff[-(1:num_de)])
non_de_mat <- apply(example_para$mean_mat[,non_de_idx], 2, function(x){
  avg <- (max(x,na.rm = TRUE)+min(x,na.rm = TRUE))/2
  new_mean <- rep(avg, length(x))
  return(new_mean)
})
example_para$mean_mat[,non_de_idx] <- non_de_mat

set.seed(123)
 example_newcount <- simu_new(
    sce = example_sce,
    mean_mat = example_para$mean_mat,
    sigma_mat = example_para$sigma_mat,
    zero_mat = example_para$zero_mat,
    quantile_mat = NULL,
    copula_list = example_copula$copula_list,
    n_cores = 1,
    family_use = "nb",
    input_data = example_data$dat,
    new_covariate = example_data$newCovariate,
    important_feature = example_copula$important_feature,
    filtered_gene = example_data$filtered_gene
  )

DE genes identification

Then, we follow Seurat’s pipeline to preprocess the simulated data.

seurat_obj <- CreateSeuratObject(counts = example_newcount, project = "seurat_obj", min.cells = 0, min.features = 0)
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^MT-")
seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 10000)
seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "vst",nfeatures = 200)
all.genes <- rownames(seurat_obj)
seurat_obj <- ScaleData(seurat_obj, features = all.genes)
seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(object = seurat_obj))
seurat_obj <- JackStraw(seurat_obj, num.replicate = 100)
seurat_obj <- ScoreJackStraw(seurat_obj, dims = 1:20)

Since we already have the ground truth cell type annotations for our simulated dataset, we can directly use the cell type annotations we have instead of running FindClusters from Seurat to avoid the double-dipping issue.

seurat_obj <- FindNeighbors(seurat_obj, dims = 1:10)
ct <- colData(example_sce)$cell_type
names(ct) <- colnames(example_sce)
seurat_obj[["cell_type"]] <- ct
Idents(seurat_obj) <- "cell_type"
seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)

Then, we follow Seurat’s tutorial to conduct DE test.

test <- c("wilcox", "bimod", "t", "poisson", "negbinom", "LR", "MAST", "DESeq2")
qvals <- matrix(0, nrow = dim(seurat_obj)[1], ncol = length(test))
for (x in 1:length(test)) {
  markers <- FindMarkers(seurat_obj, ident.1 = "b.cells", ident.2 = "regulatory.t", test.use = test[x],
                         logfc.threshold = 0, min.pct = 0, min.cells.feature = 1, min.cells.group = 1)
  qvals[,x] <- p.adjust(markers[rownames(seurat_obj),"p_val"], method = "BH", length(rownames(seurat_obj)))
}
colnames(qvals) <- test
rownames(qvals) <- rownames(seurat_obj)

Since we manually created non-DE genes in the extra_para() step, now we can calculate the actual false discovery proportion(FDP) and power of the DE tests we conducted above with various target FDR threshold.

targetFDR <- c(seq(0.01,0.1,by=0.01),seq(0.2,0.5,by=0.1))
de <-de_idx
fdp_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(fdp_mat) <- test
rownames(fdp_mat) <- targetFDR
power_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(power_mat) <- test
rownames(power_mat) <- targetFDR

for (t in 1:length(test)) {
  curr_p = qvals[,t]
  for (i in 1:length(targetFDR)) {
    thre <- targetFDR[i]
    discovery <- which(curr_p <= thre)
    tp <- length(intersect(names(discovery),de))
    if(length(discovery) == 0){
      fdp <- 0
    }else{
      fdp <- (length(discovery) - tp)/length(discovery)
    }
    power <- tp/length(de)
    fdp_mat[i, t] <- fdp
    power_mat[i,t] <- power
  }
}

Lastly, we visualize the Target FDR vs Actual FDP and Target FDR vs Power below.

fdp_long <- melt(fdp_mat)
colnames(fdp_long) <- c("Target FDR","test_method","Actual FDP")
fdp_plot <- ggplot(fdp_long) +
  geom_line(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_abline(intercept = 0, slope=1,linetype="dashed",color="grey")+ 
 theme(aspect.ratio = 1) + expand_limits(x = 0, y = c(0,1))
fdp_plot

power_long <- melt(power_mat)
colnames(power_long) <- c("Target FDR","test_method","Power")
power_plot <- ggplot(power_long) +
  geom_line(aes(x=`Target FDR`, y=Power,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=Power,color=test_method))+
  theme(aspect.ratio = 1)
power_plot

Identification of DE genes along a trajectory

Read in the reference data

The raw data is from the scvelo, which describes pancreatic endocrinogenesis. We pre-select the top 1000 highly variable genes and filter out some cell types to ensure a single trajectory.

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581992")))
print(example_sce)
#> class: SingleCellExperiment 
#> dim: 1000 2087 
#> metadata(5): clusters_coarse_colors clusters_colors day_colors
#>   neighbors pca
#> assays(6): X spliced ... cpm logcounts
#> rownames(1000): Pyy Iapp ... Eya2 Kif21a
#> rowData names(1): highly_variable_genes
#> colnames(2087): AAACCTGAGAGGGATA AAACCTGGTAAGTGGC ... TTTGTCAAGTGACATA
#>   TTTGTCAAGTGTGGCA
#> colData names(7): clusters_coarse clusters ... sizeFactor pseudotime
#> reducedDimNames(4): X_pca X_umap PCA UMAP
#> mainExpName: NULL
#> altExpNames(0):

To save computational time, we only use the top 200 genes.

example_sce <- example_sce[1:200, ]

Simulation

set.seed(1)
PANCREAS_data <- construct_data(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = "pseudotime",
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
  )

PANCREAS_marginal <- fit_marginal(
    data = PANCREAS_data,
    predictor = "gene",
    mu_formula = "s(pseudotime, k = 10, bs = 'cr')",
    sigma_formula = "s(pseudotime, k = 5, bs = 'cr')",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE
  )

set.seed(1)
PANCREAS_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = PANCREAS_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = PANCREAS_data$dat
  )

PANCREAS_para <- extract_para(
    sce = example_sce,
    marginal_list = PANCREAS_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = PANCREAS_data$newCovariate,
    data = PANCREAS_data$dat
  )

Here, we examine the mean_mat, which is one of the outputs from the previous function extract_para(). For each gene, we calculate the difference in the between the maximum mean parameter and minimum mean parameter across all cells. We select genes which the gene’s mean difference across cells are in the top 50 largest differences. We regard these genes as DE genes(DEG). Then, we manually set the mean parameters of the rest genes to be the same across all cells. We regard all genes with the same mean parameter across cells as non-DE genes.

diff <- apply(PANCREAS_para$mean_mat, 2, function(x){max(x)-min(x)})
diff_ordered <- order(diff, decreasing = TRUE)
diff <- diff[diff_ordered]
num_de <- 50
de_idx <- names(diff[1:num_de])
non_de_idx <- names(diff[-(1:num_de)])
non_de_mat <- apply(PANCREAS_para$mean_mat[,non_de_idx], 2, function(x){
  avg <- mean(x)
  new_mean <- rep(avg, length(x))
  return(new_mean)
})
PANCREAS_para$mean_mat[,non_de_idx] <- non_de_mat

set.seed(1)
 PANCREAS_newcount <- simu_new(
    sce = example_sce,
    mean_mat = PANCREAS_para$mean_mat,
    sigma_mat = PANCREAS_para$sigma_mat,
    zero_mat = PANCREAS_para$zero_mat,
    quantile_mat = NULL,
    copula_list = PANCREAS_copula$copula_list,
    n_cores = 1,
    family_use = "nb",
    input_data = PANCREAS_data$dat,
    new_covariate = PANCREAS_data$newCovariate,
    important_feature = PANCREAS_copula$important_feature,
    filtered_gene = PANCREAS_data$filtered_gene
  )

simu_sce <- SingleCellExperiment(list(counts = PANCREAS_newcount), colData = PANCREAS_data$newCovariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))

DE genes identification

Now, we use the simulated data to benchmark the performance of four DEG identification methods. The p-values from the two methods after Benjamini-Hochberg(BH) corretion will be stored in qvals in the following code.

qvals <- matrix(0, ncol = 4, nrow = dim(simu_sce)[1])
colnames(qvals) <- c("Monocle3-DE","tradeSeq","NBAMseq","PseudotimeDE")
rownames(qvals) <- rownames(simu_sce)

Monocle3

We follow the tutorial from Monocle3 to conduct the DE test and obtain the p-values after BH correction.

rowdata <- rowData(simu_sce)
rowdata$gene_short_name <- rownames(rowData(simu_sce))
coldata <- colData(simu_sce)
cds <- new_cell_data_set(counts(simu_sce),
                           cell_metadata = coldata,
                           gene_metadata = rowdata)
gene_fits <- fit_models(cds, model_formula_str = "~pseudotime")
fit_coefs <- coefficient_table(gene_fits)
fit <- fit_coefs %>% dplyr::filter(term == "pseudotime") %>% dplyr::select(p_value, q_value) %>% dplyr::mutate(gene = rownames(simu_sce))
fit
qvals[fit$gene,"Monocle3-DE"] <- fit$q_value

tradeSeq

We follow the tutorial from tradeSeq to conduct the DE test and obtain the p-values after BH correction.

pseudo <- colData(simu_sce)$pseudotime
icMat <- evaluateK(counts = counts(simu_sce), pseudotime  = pseudo, cellWeights = rep(1, dim(simu_sce)[2]), k = 3:10, nGenes = 100, verbose = FALSE, plot = TRUE)

res_tradeSeq <- fitGAM(counts = as.matrix(assays(simu_sce)$counts), pseudotime = pseudo, cellWeights = rep(1, length(pseudo)),nknots = 10)
assoRes <- associationTest(res_tradeSeq, lineages = FALSE)
assoRes <- assoRes %>% as_tibble(rownames = "gene") %>% dplyr::mutate(qvalue = p.adjust(pvalue, method = "BH"))
qvals[assoRes$gene,"tradeSeq"] <- assoRes$qvalue

NBAMseq

We follow the tutorial from NBAMseq to conduct the DE test and obtain the p-values after BH correction.

countdata <- counts(simu_sce)
coldata <- colData(simu_sce)
design = ~s(pseudotime)
gsd = NBAMSeqDataSet(countData = countdata, colData = coldata, design = design)
gsd = NBAMSeq(gsd) 
res1 = NBAMSeq::results(gsd, name = "pseudotime")
head(res1)
qvals[rownames(res1),"NBAMseq"] = res1$padj

PseudotimeDE

We follow the tutorial from PseudotimeDE to conduct the DE test and obtain the p-values after BH correction. Here to save computational time, we use the fix-mode of PseudotimeDE.

ori_tbl <- tibble(cell = colnames(simu_sce), pseudotime = colData(simu_sce)$pseudotime)
res <- runPseudotimeDE(gene.vec = rownames(simu_sce),
                                     ori.tbl = ori_tbl,
                                     sub.tbl = NULL, 
                                     mat = simu_sce, 
                                     model = "nb")
qvals[res$gene,"PseudotimeDE"] <- p.adjust(res$fix.pv, method = "BH", length(res$gene))

Since tradeSeq’s result contains some NA, we convert the NA to 1 first.

qvals[is.na(qvals)] <- 1
print(head(qvals))

test = colnames(qvals)
targetFDR <- c(seq(0.01,0.1,by=0.01),seq(0.2,0.5,by=0.1))
de <- colnames(PANCREAS_para$mean_mat[,de_idx])
fdp_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(fdp_mat) <- test
rownames(fdp_mat) <- targetFDR
power_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(power_mat) <- test
rownames(power_mat) <- targetFDR
for (t in 1:length(test)) {
  curr_p = qvals[,t]
  for (i in 1:length(targetFDR)) {
    thre <- targetFDR[i]
    discovery <- which(curr_p <= thre)
    tp <- length(intersect(names(discovery),de))
    if(length(discovery) == 0){
      fdp <- 0
    }else{
      fdp <- (length(discovery) - tp)/length(discovery)
    }
  
    power <- tp/length(de)
    fdp_mat[i, t] <- fdp
    power_mat[i,t] <- power
  }
}

Lastly, we visualize the Target FDR vs Actual FDP and Target FDR vs Power below.

fdp_long <- melt(fdp_mat)
colnames(fdp_long) <- c("Target FDR","test_method","Actual FDP")
fdp_plot <- ggplot(fdp_long) +
  geom_line(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_abline(intercept = 0, slope=1,linetype="dashed",color="grey")+ 
 theme(aspect.ratio = 1) + expand_limits(x = 0, y = c(0,1))
fdp_plot

power_long <- melt(power_mat)
colnames(power_long) <- c("Target FDR","test_method","Power")
power_plot <- ggplot(power_long) +
  geom_line(aes(x=`Target FDR`, y=Power,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=Power,color=test_method))+
  theme(aspect.ratio = 1)
power_plot

Identification of Spatially Variable Genes (SVG) in spatial transcriptomic data

Read in the reference data

The raw data is from the Seurat, which is a dataset generated with the Visium technology from 10x Genomics. We pre-select the top spatial variable genes.

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))
print(example_sce)
#> class: SingleCellExperiment 
#> dim: 1000 2696 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(1000): Calb2 Gng4 ... Fndc5 Gda
#> rowData names(0):
#> colnames(2696): AAACAAGTATCTCCCA-1 AAACACCAATAACTGC-1 ...
#>   TTGTTTCACATCCAGG-1 TTGTTTCCATACAACT-1
#> colData names(12): orig.ident nCount_Spatial ... spatial2 cell_type
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):

To save time, we subset the top 200 genes.

example_sce <- example_sce[1:200, ]

We remove the mitochondrial genes since this is a preprocessing step in SPARK-X, which is one of the method we will use to identify spatially variable genes.

mt_idx<- grep("mt-",rownames(example_sce))
if(length(mt_idx)!=0){
    example_sce   <- example_sce[-mt_idx,]
}

Simulation

set.seed(1)
example_data <- construct_data(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = c("spatial1", "spatial2"),
    other_covariates = NULL,
    corr_by = "1"
  )

example_marginal <- fit_marginal(
    data = example_data,
    predictor = "gene",
    mu_formula = "s(spatial1, spatial2, bs = 'gp', k= 50)",
    sigma_formula = "1",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE
  )

set.seed(1)
example_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = example_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = example_data$dat
  )

example_para <- extract_para(
    sce = example_sce,
    marginal_list = example_marginal,
    n_cores = 2,
    family_use = "nb",
    new_covariate = example_data$newCovariate,
    data = example_data$dat
  )

Here, we examine the mean_mat, which is one of the outputs from the previous function extract_para(). For each gene, we calculate the deviance explained by spatial locations in each regression model, and select the top 50. We regard these genes as spatially variable genes (SVGs). Then, we manually set the mean parameters of the rest genes to be the same across all cells. We regard all genes with the same mean parameter across cells as non-SVG genes.

dev_explain <- sapply(example_marginal, function(x){
  sum = summary(x$fit)
  return(sum$dev.expl)
})
dev_ordered <- order(dev_explain, decreasing = TRUE)
num_de <- 50
ordered <- dev_explain[dev_ordered]
de_idx <- names(ordered)[1:num_de]
non_de_idx <- names(ordered)[-(1:num_de)]
non_de_mat <- apply(example_para$mean_mat[,non_de_idx], 2, function(x){
  avg <- (max(x)+min(x))/2
  new_mean <- rep(avg, length(x))
  return(new_mean)
})
example_para$mean_mat[,non_de_idx] <- non_de_mat

Another way to select SVG based on Moran’s I.

# num_de <- 50
# loc = colData(simu_sce)[,c("spatial1","spatial2")]
# features = FindSpatiallyVariableFeatures(counts(simu_sce), spatial.location = loc, selection.method = "moransi",nfeatures = num_de)
# top.features = features[order(features$p.value),]
# top.features= rownames(top.features[1:num_de,])
# de_idx <- which(rownames(simu_sce) %in% top.features)
# non_de_idx <-which(!rownames(simu_sce) %in% top.features)
# non_de_mat <- apply(example_para$mean_mat[,non_de_idx], 2, function(x){
#   avg <- (max(x)+min(x))/2
#   new_mean <- rep(avg, length(x))
#   return(new_mean)
# })
# example_para$mean_mat[,non_de_idx] <- non_de_mat

set.seed(1)
 example_newcount <- simu_new(
    sce = example_sce,
    mean_mat = example_para$mean_mat,
    sigma_mat = example_para$sigma_mat,
    zero_mat = example_para$zero_mat,
    quantile_mat = NULL,
    copula_list = example_copula$copula_list,
    n_cores = 1,
    family_use = "nb",
    input_data = example_data$dat,
    new_covariate = example_data$newCovariate,
    important_feature = rep(TRUE, dim(example_sce)[1]),
    filtered_gene = example_data$filtered_gene
  )

simu_sce <- SingleCellExperiment(list(counts =example_newcount), colData = example_data$newCovariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))

We rescale the log count of 5 synthetic SVG here to the 0 to 1 scale and visualize them below.

de_genes = de_idx[1:5]
loc = colData(simu_sce)[,c("spatial1","spatial2")]
expre = lapply(de_genes, function(x){
    curr = as.matrix(counts(simu_sce)[x,])
    curr = log1p(curr)
    return(rescale(curr))
  })
long = do.call(rbind, expre)
long = as.data.frame(long)
colnames(long) <- "Expression"
long$gene = do.call(c, lapply(de_genes, function(x){rep(x,dim(expre[[1]])[1])}))
long$x = rep(loc[,1],5)
long$y = rep(loc[,2],5)
as_tibble(long, rownames = "Cell") %>% ggplot(aes(x = x, y = y, color = Expression)) +geom_point(size = 0.1)+facet_grid(~gene)+ scale_colour_gradientn(colors = viridis_pal(option = "magma")(10), limits=c(0, 1)) + coord_fixed(ratio = 1) + theme(axis.text.x = element_text(angle = 45))

We also rescale the log count of 5 synthetic non-SVG here to the 0 to 1 scale and visualize them below.

non_de_genes = non_de_idx[1:5]
loc = colData(simu_sce)[,c("spatial1","spatial2")]
expre = lapply(non_de_genes, function(x){
    curr = as.matrix(counts(simu_sce)[x,])
    curr = log1p(curr)
    return(rescale(curr))
  })
long = do.call(rbind, expre)
long = as.data.frame(long)
colnames(long) <- "Expression"
long$gene = do.call(c, lapply(non_de_genes, function(x){rep(x,dim(expre[[1]])[1])}))
long$x = rep(loc[,1],5)
long$y = rep(loc[,2],5)
as_tibble(long, rownames = "Cell") %>% ggplot(aes(x = x, y = y, color = Expression)) +geom_point(size = 0.1)+facet_grid(~gene)+ scale_colour_gradientn(colors = viridis_pal(option = "magma")(10), limits=c(0, 1)) + coord_fixed(ratio = 1) + theme(axis.text.x = element_text(angle = 45))

SVG identification

Now, we use the simulated data to benchmark the performance of three SVG identification methods. The p-values from the two methods after Benjamini-Hochberg(BH) correction will be stored in qvals in the following code.

qvals <- matrix(0, ncol = 3, nrow = dim(simu_sce)[1])
colnames(qvals) <- c("spatialDE","SPARK","SPARK-X")
rownames(qvals) <- rownames(simu_sce)

spatialDE

We follow the tutorial from spatialDE to conduct the hypothesis test and obtain the p-values after BH correction.

count <- counts(simu_sce)
sample_info <- colData(simu_sce)[, c("spatial1","spatial2")]
sample_info <- as.data.frame(sample_info)
colnames(sample_info) <- c("x","y")
count <- count[rowSums(count) >=3, ]
count <- count[, row.names(sample_info)]
sample_info$total_counts <- colSums(count)
X <- sample_info[,c("x","y")]
norm_expr <- stabilize(count)
resid_expr <- regress_out(norm_expr, sample_info = sample_info)
results_spatialDE <- spatialDE::run(resid_expr, coordinates = X)
rownames(results_spatialDE) <- results_spatialDE$g
qvals[,"spatialDE"] <- results_spatialDE[rownames(simu_sce),"qval"]

SPARK

We then follow the tutorial from SPARK to conduct the hypothesis test and obtain the p-values after BH correction.

rawcount <- counts(simu_sce)
info <- as.data.frame(colData(simu_sce)[, c("spatial1","spatial2")])
colnames(info) <- c("x","y")
info$total_count <- apply(rawcount,2,sum)
rownames(info) <- colnames(rawcount)
spark <- CreateSPARKObject(counts=rawcount, 
                             location=info[,1:2],
                             percentage = 0, 
                             min_total_counts = 10)
spark@lib_size <- apply(spark@counts, 2, sum)
spark <- spark.vc(spark, 
                   covariates = NULL, 
                   lib_size = spark@lib_size, 
                   num_core = 2,
                   verbose = FALSE)
spark <- spark.test(spark, 
                     check_positive = TRUE, 
                     verbose = FALSE)
qvals[,"SPARK"] <- spark@res_mtest$adjusted_pvalue

SPARK-X

We then follow the tutorial from SPARK-X to conduct the hypothesis test and obtain the p-values after BH correction.

sp_count <- counts(simu_sce)
info <- as.data.frame(colData(simu_sce)[, c("spatial1","spatial2")])
colnames(info) <- c("x","y")
info$total_count <- apply(sp_count ,2,sum)
rownames(info) <- colnames(sp_count )
location <- as.matrix(info)
mt_idx      <- grep("mt-",rownames(sp_count))
if(length(mt_idx)!=0){
    sp_count    <- sp_count[-mt_idx,]
}
sparkX <- sparkx(sp_count,location,numCores=1,option="mixture")
head(sparkX$res_mtest)
qvals[,"SPARK-X"] <-sparkX$res_mtest$adjustedPval

Since we manually created non-SVG in the extra_para() step, now we can calculate the actual false discovery proportion(FDP) and power of the tests we conducted above with various target FDR threshold.

test = colnames(qvals)
targetFDR <- c(0.01,0.05,0.1,0.2,0.5)
de <- de_idx
fdp_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(fdp_mat) <- test
rownames(fdp_mat) <- targetFDR
power_mat <- matrix(0, nrow = length(targetFDR), ncol = length(test))
colnames(power_mat) <- test
rownames(power_mat) <- targetFDR

for (t in 1:length(test)) {
  curr_p <- qvals[,t]
  curr_test <- test[t]
  for (i in 1:length(targetFDR)) {
    thre <- targetFDR[i]
    if(curr_test == "spatialDE"){
      de_results <- results_spatialDE[results_spatialDE$qval < thre, ]
      ms_results <- model_search(
      resid_expr,
      coordinates = X,
      de_results = de_results
      )
      discovery <- ms_results$g
      tp <- length(intersect(ms_results$g,de))
    }else{
      discovery <- which(curr_p <= thre)
      tp <- length(intersect(names(discovery),de))
    }
    if(length(discovery) == 0){
      fdp <- 0
    }else{
      fdp <- (length(discovery) - tp)/length(discovery)
    }
    power <- tp/length(de)
    fdp_mat[i, t] <- fdp
    power_mat[i,t] <- power
  }
}

Lastly, we visualize the Target FDR vs Actual FDP and Target FDR vs Power below.

fdp_long <- melt(fdp_mat)
colnames(fdp_long) <- c("Target FDR","test_method","Actual FDP")
fdp_plot <- ggplot(fdp_long) +
  geom_line(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=`Actual FDP`,color=test_method))+
  geom_abline(intercept = 0, slope=1,linetype="dashed",color="grey")+ 
 theme(aspect.ratio = 1) + expand_limits(x = 0, y = c(0,1))
fdp_plot

power_long <- melt(power_mat)
colnames(power_long) <- c("Target FDR","test_method","Power")
power_plot <- ggplot(power_long) +
  geom_line(aes(x=`Target FDR`, y=Power,color=test_method))+
  geom_point(aes(x=`Target FDR`, y=Power,color=test_method))+
  theme(aspect.ratio = 1)
power_plot

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] tidyr_1.3.0                 scales_1.2.1               
#>  [3] SPARK_1.1.1                 spatialDE_1.6.0            
#>  [5] reshape2_1.4.4              tradeSeq_1.14.0            
#>  [7] dplyr_1.1.2                 PseudotimeDE_1.0.0         
#>  [9] NBAMSeq_1.16.0              BiocParallel_1.34.2        
#> [11] DESeq2_1.40.2               monocle3_1.3.1             
#> [13] scran_1.28.1                scuttle_1.10.1             
#> [15] SeuratObject_4.1.3          Seurat_4.3.0.1             
#> [17] DuoClustering2018_1.18.0    ggplot2_3.4.2              
#> [19] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#> [21] Biobase_2.60.0              GenomicRanges_1.52.0       
#> [23] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#> [25] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#> [27] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [29] scDesign3_0.99.6            BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] fs_1.6.3                      spatstat.sparse_3.0-2        
#>   [3] bitops_1.0-7                  doParallel_1.0.17            
#>   [5] httr_1.4.6                    RColorBrewer_1.1-3           
#>   [7] backports_1.4.1               tools_4.3.1                  
#>   [9] sctransform_0.3.5             utf8_1.2.3                   
#>  [11] R6_2.5.1                      HDF5Array_1.28.1             
#>  [13] mgcv_1.9-0                    lazyeval_0.2.2               
#>  [15] uwot_0.1.16                   rhdf5filters_1.12.1          
#>  [17] withr_2.5.0                   sp_2.0-0                     
#>  [19] prettyunits_1.1.1             gridExtra_2.3                
#>  [21] progressr_0.13.0              cli_3.6.1                    
#>  [23] textshaping_0.3.6             spatstat.explore_3.2-1       
#>  [25] biglm_0.9-2.1                 labeling_0.4.2               
#>  [27] sass_0.4.7                    mvtnorm_1.2-2                
#>  [29] spatstat.data_3.0-1           genefilter_1.82.1            
#>  [31] gamlss_5.4-12                 ggridges_0.5.4               
#>  [33] pbapply_1.7-2                 slingshot_2.8.0              
#>  [35] pkgdown_2.0.7                 systemfonts_1.0.4            
#>  [37] R.utils_2.12.2                parallelly_1.36.0            
#>  [39] limma_3.56.2                  RSQLite_2.3.1                
#>  [41] generics_0.1.3                gamlss.data_6.0-2            
#>  [43] ica_1.0-3                     spatstat.random_3.1-5        
#>  [45] Matrix_1.6-0                  fansi_1.0.4                  
#>  [47] abind_1.4-5                   R.methodsS3_1.8.2            
#>  [49] terra_1.7-39                  lifecycle_1.0.3              
#>  [51] yaml_2.3.7                    edgeR_3.42.4                 
#>  [53] CompQuadForm_1.4.3            rhdf5_2.44.0                 
#>  [55] BiocFileCache_2.8.0           Rtsne_0.16                   
#>  [57] grid_4.3.1                    blob_1.2.4                   
#>  [59] promises_1.2.0.1              dqrng_0.3.0                  
#>  [61] ExperimentHub_2.8.1           crayon_1.5.2                 
#>  [63] dir.expiry_1.8.0              miniUI_0.1.1.1               
#>  [65] lattice_0.21-8                beachmat_2.16.0              
#>  [67] cowplot_1.1.1                 annotate_1.78.0              
#>  [69] KEGGREST_1.40.0               magick_2.7.4                 
#>  [71] pillar_1.9.0                  knitr_1.43                   
#>  [73] metapod_1.8.0                 rjson_0.2.21                 
#>  [75] boot_1.3-28.1                 future.apply_1.11.0          
#>  [77] codetools_0.2-19              leiden_0.4.3                 
#>  [79] glue_1.6.2                    data.table_1.14.8            
#>  [81] vctrs_0.6.3                   png_0.1-8                    
#>  [83] gtable_0.3.3                  assertthat_0.2.1             
#>  [85] cachem_1.0.8                  xfun_0.39                    
#>  [87] DropletUtils_1.20.0           princurve_2.1.6              
#>  [89] S4Arrays_1.0.4                mime_0.12                    
#>  [91] pracma_2.4.2                  survival_3.5-5               
#>  [93] iterators_1.0.14              statmod_1.5.0                
#>  [95] bluster_1.10.0                interactiveDisplayBase_1.38.0
#>  [97] ellipsis_0.3.2                fitdistrplus_1.1-11          
#>  [99] ROCR_1.0-11                   nlme_3.1-162                 
#> [101] bit64_4.0.5                   progress_1.2.2               
#> [103] filelock_1.0.2                RcppAnnoy_0.0.21             
#> [105] rprojroot_2.0.3               bslib_0.5.0                  
#> [107] irlba_2.3.5.1                 matlab_1.0.4                 
#> [109] KernSmooth_2.23-22            colorspace_2.1-0             
#> [111] DBI_1.1.3                     tidyselect_1.2.0             
#> [113] bit_4.0.5                     compiler_4.3.1               
#> [115] curl_5.0.1                    BiocNeighbors_1.18.0         
#> [117] basilisk.utils_1.12.1         desc_1.4.2                   
#> [119] DelayedArray_0.26.6           plotly_4.10.2                
#> [121] bookdown_0.34                 checkmate_2.2.0              
#> [123] lmtest_0.9-40                 speedglm_0.3-5               
#> [125] rappdirs_0.3.3                SpatialExperiment_1.10.0     
#> [127] stringr_1.5.0                 digest_0.6.33                
#> [129] goftest_1.2-3                 spatstat.utils_3.0-3         
#> [131] minqa_1.2.5                   rmarkdown_2.23               
#> [133] basilisk_1.12.1               XVector_0.40.0               
#> [135] htmltools_0.5.5               pkgconfig_2.0.3              
#> [137] lme4_1.1-34                   sparseMatrixStats_1.12.2     
#> [139] highr_0.10                    dbplyr_2.3.3                 
#> [141] fastmap_1.1.1                 rlang_1.1.1                  
#> [143] htmlwidgets_1.6.2             ggthemes_4.2.4               
#> [145] shiny_1.7.4.1                 DelayedMatrixStats_1.22.1    
#> [147] farver_2.1.1                  jquerylib_0.1.4              
#> [149] zoo_1.8-12                    jsonlite_1.8.7               
#> [151] mclust_6.0.0                  R.oo_1.25.0                  
#> [153] BiocSingular_1.16.0           RCurl_1.98-1.12              
#> [155] magrittr_2.0.3                GenomeInfoDbData_1.2.10      
#> [157] patchwork_1.1.2               Rhdf5lib_1.22.0              
#> [159] munsell_0.5.0                 Rcpp_1.0.11                  
#> [161] TrajectoryUtils_1.8.0         viridis_0.6.4                
#> [163] reticulate_1.30               stringi_1.7.12               
#> [165] zlibbioc_1.46.0               MASS_7.3-60                  
#> [167] MAST_1.26.0                   AnnotationHub_3.8.0          
#> [169] plyr_1.8.8                    gamlss.dist_6.0-5            
#> [171] listenv_0.9.0                 ggrepel_0.9.3                
#> [173] deldir_1.0-9                  Biostrings_2.68.1            
#> [175] splines_4.3.1                 tensor_1.5                   
#> [177] hms_1.1.3                     locfit_1.5-9.8               
#> [179] igraph_1.5.0.1                spatstat.geom_3.2-4          
#> [181] ScaledMatrix_1.8.1            XML_3.99-0.14                
#> [183] BiocVersion_3.17.1            evaluate_0.21                
#> [185] BiocManager_1.30.21.1         foreach_1.5.2                
#> [187] nloptr_2.0.3                  httpuv_1.6.11                
#> [189] RANN_2.6.1                    purrr_1.0.1                  
#> [191] polyclip_1.10-4               future_1.33.0                
#> [193] scattermore_1.2               rsvd_1.0.5                   
#> [195] xtable_1.8-4                  later_1.3.1                  
#> [197] viridisLite_0.4.2             ragg_1.2.5                   
#> [199] tibble_3.2.1                  memoise_2.0.1                
#> [201] AnnotationDbi_1.62.2          cluster_2.1.4                
#> [203] globals_0.16.2                here_1.0.1