Perform ClusterDE on a single-domain spatial dataset

#tools::R_user_dir("ClusterDE", which="cache")
library(Seurat)
library(SingleCellExperiment)
library(spatialLIBD)
library(ggplot2)
library(ClusterDE)
library(BayesSpace)
library(dplyr)
library(scales)

Download data

We selected layer 5 of 151673 slice from the LIBD Human Dorsolateral Prefrontal Cortex (DLPFC) dataset, which is downloaded in the spatialLIBD R package. Since the data contains only one domain, there should not have between domains marker genes.

## Download the spot-level data
spe <- fetch_data(type = "spe")
#> adding rname 'https://www.dropbox.com/s/f4wcvtdq428y73p/Human_DLPFC_Visium_processedData_sce_scran_spatialLIBD.Rdata?dl=1'
#> 2024-09-17 00:27:47.947728 loading file /home/runner/.cache/R/BiocFileCache/82ee129a7fa4_Human_DLPFC_Visium_processedData_sce_scran_spatialLIBD.Rdata%3Fdl%3D1

###select the Layer5 in the slice 151673
sub_151673 <- spe[, spe$sample_id == "151673"]
index <- sub_151673$spatialLIBD == "L5"
index[which(is.na(index))]="NAN"
sub_151673 <- sub_151673[, index=="TRUE"]
print(sub_151673)
#> class: SpatialExperiment 
#> dim: 33538 673 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
#>   ENSG00000268674
#> rowData names(9): source type ... gene_search is_top_hvg
#> colnames(673): AAACAGCTTTCAGAAG-1 AAACCTCATGAAGTTG-1 ...
#>   TTGTTCAGTGTGCTAC-1 TTGTTGTGTGTCAAGA-1
#> colData names(69): sample_id Cluster ... array_row array_col
#> reducedDimNames(6): PCA TSNE_perplexity50 ... TSNE_perplexity80
#>   UMAP_neighbors15
#> mainExpName: NULL
#> altExpNames(0):
#> spatialCoords names(2) : pxl_col_in_fullres pxl_row_in_fullres
#> imgData names(4): sample_id image_id data scaleFactor

#delete the genes that express rate less than 20%
data <- sub_151673@assays@data$counts
zero_expre_rate <- apply(data,1,function(x){
  zero_true<-x==0
  zero_num<-length(which(zero_true==TRUE))/dim(data)[2]
  return(zero_num)
})
zero_expre_gene_idx <- which(zero_expre_rate<0.8)
print(paste0("the number of gene with zero expression rate: ",sum(zero_expre_rate==1),sep=""))
#> [1] "the number of gene with zero expression rate: 14846"
sub_151673 <- sub_151673[zero_expre_gene_idx,]
print(paste0("the size of data: ",dim(sub_151673)[1],"*",dim(sub_151673)[2],sep=""))
#> [1] "the size of data: 3970*673"

###construct the SingleCellExperiment object
real_sce <- SingleCellExperiment(list(counts=sub_151673@assays@data$counts))
###add colData information of singlecellexperiment
real_sce$spatial1 <- sub_151673@int_colData@listData$spatialCoords[,2]
real_sce$spatial2 <- sub_151673@int_colData@listData$spatialCoords[,1]
real_sce$cell_type <- sub_151673@colData$spatialLIBD
###visualize the real spatial domains
real_sce_domains <- data.frame(Xaxis=real_sce$spatial1,Yaxis=real_sce$spatial2,Domain=real_sce$cell_type)
ggplot(real_sce_domains, aes(x = Xaxis, y = Yaxis, col =Domain)) + geom_point(size=1.0) +
  ggtitle("Manual annotation \n (The layer5 in the slice 151673)") + 
  theme(plot.title = element_text(size=10,hjust=0.5),
        panel.grid=element_blank(),
        panel.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "black", fill = NA, size = 0.6),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank()) + scale_color_manual(values = c("#5791cc"))
#> Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.

Visualize the real data.

Run the BayesSpace + Seurat pipeline

Firstly, we employed the BayesSpace for spatial clustering. Please note that ClusterDE is designed for 1 vs 1 comparison; therefore, we obtain two spatial clusters for illustration purpose.

###construct the input of BayesSpace based on real dataset
#the input of BayesSpace is sce object
real_sce_clu <- SingleCellExperiment(list(counts=real_sce@assays@data$counts))
# add colData information of singlecellexperiment
real_sce_clu$row <- real_sce$spatial1
real_sce_clu$col <- real_sce$spatial2
## log-normalize the count data
set.seed(102)
real_sce_clu <- spatialPreprocess(real_sce_clu, platform="ST", n.PCs=7,log.normalize=TRUE)
#clustering with BayesSpace
set.seed(149)
real_sce_clu <- spatialCluster(real_sce_clu, q=2, platform="ST", d=7,
                               init.method="mclust", model="t", gamma=2,
                               nrep=1000, burn.in=100,
                               save.chain=TRUE)
#> Neighbors were identified for 0 out of 673 spots.
#> Fitting model...
#> Calculating labels using iterations 100 through 1000.
#visualize the spatial cluster
real_sce_spatial_clu <- data.frame(Xaxis=real_sce_clu$row,Yaxis=real_sce_clu$col,Clusters=as.character(real_sce_clu$spatial.cluster))
ggplot(real_sce_spatial_clu, aes(x = Xaxis, y = Yaxis, col =Clusters)) + geom_point(size=1.0) +
  ggtitle("Real data \n (spatial clusters detected by BayesSpace)") + 
  theme(plot.title = element_text(size=10,hjust=0.5),
        panel.grid = element_blank(),
        panel.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "black", fill = NA, size = 0.6),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank()) + scale_color_manual(values = c("#e87d72","#54bcc2"))

Visualize the spatial clustering results based on the real data

Then, we used the common DE method (Wilcoxon Rank Sum Test) to identify domain marker genes between the two spatial clusters.

#Identify domain marker genes in the real dataset based on the BayesSpace clustering result, follow Seurat tutorial
#create Seurat object
real_count_dataset <- real_sce@assays@data$counts
real_seurat <- CreateSeuratObject(counts = real_count_dataset, project = "real_seurat", min.cells = 0, min.features = 0)
real_seurat[["percent.mt"]] <- PercentageFeatureSet(real_seurat, pattern = "^MT-")
real_seurat <- NormalizeData(real_seurat, normalization.method = "LogNormalize", scale.factor = 10000)
#> Normalizing layer: counts
real_seurat <- ScaleData(real_seurat, features = rownames(real_seurat))
#> Centering and scaling data matrix
real_ct <-real_sce_clu$spatial.cluster
names(real_ct) <- colnames(real_sce)
real_seurat[["cell_type"]] <- real_ct
Idents(real_seurat) <- "cell_type"
#Then we follow seurat tutorial to conduct DE analysis
real_markers <- FindMarkers(object = real_seurat, ident.1 = unique(real_ct)[1], ident.2 =  unique(real_ct)[2], test.use = "wilcox", logfc.threshold = 0, min.pct = 0, min.cells.feature = 1, min.cells.group = 1)
#> For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
#> (default method for FindMarkers) please install the presto package
#> --------------------------------------------
#> install.packages('devtools')
#> devtools::install_github('immunogenomics/presto')
#> --------------------------------------------
#> After installation of presto, Seurat will automatically use the more 
#> efficient implementation (no further action necessary).
#> This message will be shown once per session
real_pvals <- real_markers[rownames(real_seurat),"p_val"]
names(real_pvals) <- rownames(real_seurat)

Generate synthetic null data

We first generate the synthetic null data based on the target data (real data). You can increase the number of cores to speed it up using the parameter “nCores”.

###generate synthetic null data
count_mat <- real_sce@assays@data$counts
location_mat <- data.frame(X=real_sce$spatial1,Y=real_sce$spatial2)
system.time(null_onedomain_dataset <- ClusterDE::constructNull(mat=count_mat,
                                                   family = "nb",
                                                   extraInfo=location_mat,
                                                   formula = "s(X, Y, bs = 'gp', k= 4)",
                                                   nCores = 2))
#> 100% of genes are used in correlation modelling.
#>    user  system elapsed 
#>  49.055  34.221  23.579

We perform the same pipeline as we did for target data. Please note we need two spatial clusters here, too.

###construct the input of BayesSpace based on null dataset######
null_sce <- SingleCellExperiment(list(counts =null_onedomain_dataset))
# add colData information of singlecellexperiment
null_sce$row <- real_sce$spatial1
null_sce$col <- real_sce$spatial2
## log-normalize the count data
set.seed(102)
null_sce<- spatialPreprocess(null_sce, platform="ST", n.PCs=7,log.normalize=TRUE)
#clustering with BayesSpace
set.seed(149)
null_sce <- spatialCluster(null_sce, q=2, platform="ST", d=7,
                           init.method="mclust", model="t", gamma=2,
                           nrep=1000, burn.in=100,
                           save.chain=TRUE)
#> Neighbors were identified for 0 out of 673 spots.
#> Fitting model...
#> Calculating labels using iterations 100 through 1000.
#visualize the spatial cluster
null_spatial_clu=data.frame(Xaxis=null_sce$row,Yaxis=null_sce$col,Clusters=as.character(null_sce$spatial.cluster))
ggplot(null_spatial_clu, aes(x = Xaxis, y = Yaxis, col =Clusters))  + geom_point(size=1.0) +
  ggtitle("Synthetic null data \n (spatial clusters detected by BayesSpace)") + 
  theme(plot.title = element_text(size=10,hjust=0.5),
        panel.grid=element_blank(),
        panel.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "black", fill = NA, size = 0.6),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank()) + scale_color_manual(values = c("#e87d72","#54bcc2"))

Visualize the spatial clustering results based on the synthetic null data.

Perform the common DE method (Wilcoxon Rank Sum Test) on synthetic null data

#Identify domain marker genes in the synthetic null dataset based on the BayesSpace clustering result, follow Seurat tutorial
#create Seurat object
null_count_dataset= null_onedomain_dataset
null_seurat <- CreateSeuratObject(counts = null_count_dataset, project = "null_seurat", min.cells = 0, min.features = 0)
null_seurat[["percent.mt"]] <- PercentageFeatureSet(null_seurat, pattern = "^MT-")
null_seurat <- NormalizeData(null_seurat, normalization.method = "LogNormalize", scale.factor = 10000)
#> Normalizing layer: counts
null_seurat <- ScaleData(null_seurat, features = rownames(null_seurat))
#> Centering and scaling data matrix
null_ct <- null_sce$spatial.cluster
names(null_ct) <- colnames(null_sce)
null_seurat[["cell_type"]] <- null_ct
Idents(null_seurat) <- "cell_type"
#Then we follow seurat tutorial to conduct DE analysis
null_markers <- FindMarkers(object = null_seurat, ident.1 = unique(null_ct)[1], ident.2 =  unique(null_ct)[2], test.use = "wilcox",logfc.threshold = 0, min.pct = 0, min.cells.feature = 1, min.cells.group = 1)
null_pvals <- null_markers[rownames(null_seurat),"p_val"]
names(null_pvals) <- rownames(null_seurat)

We extract the p-values from both target data and synthetic null data, then use ClusterDE to “compare” them. We do not discover any DE genes.

##run clusterDE to find real marker genes
###you can set the value of FDR, the default value is 0.05
res <- ClusterDE::callDE(real_pvals, null_pvals, nlogTrans = TRUE)
print(paste0("The number of domain marker genes is: ",length(res$DEgenes),sep=""))
#> [1] "The number of domain marker genes is: 0"

We can also visualize the distribution of contrast scores (diff between the -log p-values from real and null). It is roughly symmetric around 0.

ggplot(data = res$summaryTable, aes(x = cs)) + geom_histogram(fill = "white", color = "black") + theme_bw() + ggtitle("Distribution of constrast scores")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Session information

sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] scales_1.3.0                dplyr_1.1.4                
#>  [3] BayesSpace_1.14.0           ClusterDE_0.99.3           
#>  [5] ggplot2_3.5.1               spatialLIBD_1.16.2         
#>  [7] SpatialExperiment_1.14.0    SingleCellExperiment_1.26.0
#>  [9] SummarizedExperiment_1.34.0 Biobase_2.64.0             
#> [11] GenomicRanges_1.56.1        GenomeInfoDb_1.40.1        
#> [13] IRanges_2.38.1              S4Vectors_0.42.1           
#> [15] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
#> [17] matrixStats_1.4.1           Seurat_5.1.0               
#> [19] SeuratObject_5.0.2          sp_2.1-4                   
#> [21] BiocStyle_2.32.1           
#> 
#> loaded via a namespace (and not attached):
#>   [1] DirichletReg_0.7-1        goftest_1.2-3            
#>   [3] DT_0.33                   Biostrings_2.72.1        
#>   [5] vctrs_0.6.5               spatstat.random_3.3-1    
#>   [7] digest_0.6.37             png_0.1-8                
#>   [9] ggrepel_0.9.6             deldir_2.0-4             
#>  [11] parallelly_1.38.0         magick_2.8.4             
#>  [13] MASS_7.3-60.2             pkgdown_2.1.0            
#>  [15] reshape2_1.4.4            httpuv_1.6.15            
#>  [17] foreach_1.5.2             withr_3.0.1              
#>  [19] xfun_0.47                 survival_3.6-4           
#>  [21] memoise_2.0.1             benchmarkme_1.0.8        
#>  [23] ggbeeswarm_0.7.2          systemfonts_1.1.0        
#>  [25] ragg_1.3.3                zoo_1.8-12               
#>  [27] pbapply_1.7-2             Formula_1.2-5            
#>  [29] rematch2_2.1.2            KEGGREST_1.44.1          
#>  [31] promises_1.3.0            httr_1.4.7               
#>  [33] restfulr_0.0.15           randtoolbox_2.0.4        
#>  [35] globals_0.16.3            fitdistrplus_1.2-1       
#>  [37] rhdf5filters_1.16.0       rhdf5_2.48.0             
#>  [39] UCSC.utils_1.0.0          miniUI_0.1.1.1           
#>  [41] generics_0.1.3            curl_5.2.2               
#>  [43] fields_16.2               zlibbioc_1.50.0          
#>  [45] ScaledMatrix_1.12.0       polyclip_1.10-7          
#>  [47] GenomeInfoDbData_1.2.12   ExperimentHub_2.12.0     
#>  [49] SparseArray_1.4.8         golem_0.5.1              
#>  [51] xtable_1.8-4              stringr_1.5.1            
#>  [53] desc_1.4.3                doParallel_1.0.17        
#>  [55] evaluate_0.24.0           S4Arrays_1.4.1           
#>  [57] BiocFileCache_2.12.0      bookdown_0.40            
#>  [59] irlba_2.3.5.1             colorspace_2.1-1         
#>  [61] filelock_1.0.3            ROCR_1.0-11              
#>  [63] reticulate_1.39.0         spatstat.data_3.1-2      
#>  [65] shinyWidgets_0.8.6        magrittr_2.0.3           
#>  [67] lmtest_0.9-40             later_1.3.2              
#>  [69] viridis_0.6.5             lattice_0.22-6           
#>  [71] spatstat.geom_3.3-2       future.apply_1.11.2      
#>  [73] scattermore_1.2           XML_3.99-0.17            
#>  [75] scuttle_1.14.0            cowplot_1.1.3            
#>  [77] RcppAnnoy_0.0.22          pillar_1.9.0             
#>  [79] nlme_3.1-164              iterators_1.0.14         
#>  [81] compiler_4.4.1            beachmat_2.20.0          
#>  [83] RSpectra_0.16-2           stringi_1.8.4            
#>  [85] tensor_1.5                GenomicAlignments_1.40.0 
#>  [87] plyr_1.8.9                crayon_1.5.3             
#>  [89] abind_1.4-8               BiocIO_1.14.0            
#>  [91] scater_1.32.1             locfit_1.5-9.10          
#>  [93] bit_4.0.5                 sandwich_3.1-1           
#>  [95] codetools_0.2-20          textshaping_0.4.0        
#>  [97] BiocSingular_1.20.0       rngWELL_0.10-9           
#>  [99] coop_0.6-3                bslib_0.8.0              
#> [101] paletteer_1.6.0           plotly_4.10.4            
#> [103] mime_0.12                 splines_4.4.1            
#> [105] Rcpp_1.0.13               fastDummies_1.7.4        
#> [107] dbplyr_2.5.0              sparseMatrixStats_1.16.0 
#> [109] attempt_0.3.1             maxLik_1.5-2.1           
#> [111] knitr_1.48                blob_1.2.4               
#> [113] utf8_1.2.4                BiocVersion_3.19.1       
#> [115] fs_1.6.4                  listenv_0.9.1            
#> [117] DelayedMatrixStats_1.26.0 tibble_3.2.1             
#> [119] Matrix_1.7-0              statmod_1.5.0            
#> [121] pkgconfig_2.0.3           tools_4.4.1              
#> [123] cachem_1.1.0              RSQLite_2.3.7            
#> [125] viridisLite_0.4.2         DBI_1.2.3                
#> [127] fastmap_1.2.0             rmarkdown_2.28           
#> [129] grid_4.4.1                ica_1.0-3                
#> [131] Rsamtools_2.20.0          AnnotationHub_3.12.0     
#> [133] sass_0.4.9                patchwork_1.2.0          
#> [135] coda_0.19-4.1             BiocManager_1.30.25      
#> [137] dotCall64_1.1-1           RANN_2.6.2               
#> [139] farver_2.1.2              yaml_2.3.10              
#> [141] kde1d_1.0.7               rtracklayer_1.64.0       
#> [143] cli_3.6.3                 purrr_1.0.2              
#> [145] leiden_0.4.3.1            lifecycle_1.0.4          
#> [147] uwot_0.2.2                bluster_1.14.0           
#> [149] sessioninfo_1.2.2         BiocParallel_1.38.0      
#> [151] gtable_0.3.5              rjson_0.2.22             
#> [153] ggridges_0.5.6            progressr_0.14.0         
#> [155] parallel_4.4.1            limma_3.60.4             
#> [157] jsonlite_1.8.8            edgeR_4.2.1              
#> [159] miscTools_0.6-28          RcppHNSW_0.6.0           
#> [161] rvinecopulib_0.6.3.1.1    bitops_1.0-8             
#> [163] benchmarkmeData_1.0.4     bit64_4.0.5              
#> [165] assertthat_0.2.1          xgboost_1.7.8.1          
#> [167] Rtsne_0.17                spatstat.utils_3.1-0     
#> [169] BiocNeighbors_1.22.0      jquerylib_0.1.4          
#> [171] highr_0.11                metapod_1.12.0           
#> [173] config_0.3.2              dqrng_0.4.1              
#> [175] spatstat.univar_3.0-1     lazyeval_0.2.2           
#> [177] shiny_1.9.1               htmltools_0.5.8.1        
#> [179] sctransform_0.4.1         rappdirs_0.3.3           
#> [181] glue_1.7.0                spam_2.10-0              
#> [183] XVector_0.44.0            RCurl_1.98-1.16          
#> [185] scran_1.32.0              mclust_6.1.1             
#> [187] mvnfast_0.2.8             gridExtra_2.3            
#> [189] igraph_2.0.3              R6_2.5.1                 
#> [191] tidyr_1.3.1               labeling_0.4.3           
#> [193] cluster_2.1.6             Rhdf5lib_1.26.0          
#> [195] DelayedArray_0.30.1       tidyselect_1.2.1         
#> [197] vipor_0.4.7               maps_3.4.2               
#> [199] AnnotationDbi_1.66.0      future_1.34.0            
#> [201] rsvd_1.0.5                munsell_0.5.1            
#> [203] KernSmooth_2.23-24        data.table_1.16.0        
#> [205] htmlwidgets_1.6.4         RColorBrewer_1.1-3       
#> [207] rlang_1.1.4               spatstat.sparse_3.1-0    
#> [209] spatstat.explore_3.3-2    fansi_1.0.6              
#> [211] beeswarm_0.4.0

Siqi Chen

Dongyuan Song

17 September 2024

Download data

Run the BayesSpace + Seurat pipeline

Generate synthetic null data

Session information