Skip to contents
library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
library(DuoClustering2018)
library(scran)
library(tidyverse)
theme_set(theme_bw())

Introduction

In this tutorial, we will show how to use scDesign3 to simulate datasets adjusted by cell library size. The purpose of this tutorial is to show that including the library size when modeling the marginal distribution for each gene can help cells in the synthetic data have more similar library sizes as the cells in the real data.

Read in the reference data

The raw data is from the R package DuoClustering2018 which contain a set of datasets with true cell type labels.

sce <- get("sce_filteredExpr10_Zhengmix4eq")(metadata = FALSE)
colData(sce)$cell_type = as.factor(colData(sce)$phenoid)

We then calculate the library size for each cell.

colData(sce)$library = colSums(counts(sce))

Simulation

Then, we set the mu_formula as cell_type and offsetted by the cell library size to generate new dataset adjusted by library size. The library size is log-transformed because the link function for \(\mu\) of the negative binomial distribution in GAMLSS is \(\log\).

set.seed(123)
example_simu <- scdesign3(
    sce = sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = "library",
    mu_formula = "cell_type + offset(log(library))",
    sigma_formula = "1",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    corr_formula = "1",
    copula = "gaussian",
    DT = TRUE,
    pseudo_obs = FALSE,
    return_model = FALSE,
    nonzerovar = FALSE,
    parallelization = "pbmcmapply",
    important_feature = "auto"
  )

Then, we can create the SinglecellExperiment object using the synthetic count matrix and store the logcounts to the input and synthetic SinglecellExperiment objects.

logcounts(sce) <- log1p(counts(sce))
simu_sce <- SingleCellExperiment(list(counts = example_simu$new_count), colData = example_simu$new_covariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))
colData(simu_sce)$library <- colSums(counts(simu_sce))

Visualization

set.seed(123)
compare_figure <- plot_reduceddim(ref_sce = sce, 
                                  sce_list = list(simu_sce), 
                                  name_vec = c("Reference", "scDesign3"),
                                  assay_use = "logcounts", 
                                  if_plot = TRUE, 
                                  color_by = "cell_type", 
                                  n_pc = 20)
plot(compare_figure$p_umap)

The violin plot below shows the cells in simulated dataset have similar library size as the cells in the reference dataset.

df1 = colData(sce) %>% as_tibble() %>% select(library) %>% mutate(Method = "Reference")
df2 = colData(simu_sce) %>% as_tibble()  %>% select(library) %>% mutate(Method = "scDesign3")
df = rbind(df1,df2)
ggplot(df, aes(x = Method, y = library, color = Method)) +
     geom_violin() + theme(aspect.ratio=1)

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] lubridate_1.9.2             forcats_1.0.0              
#>  [3] stringr_1.5.0               dplyr_1.1.2                
#>  [5] purrr_1.0.1                 readr_2.1.4                
#>  [7] tidyr_1.3.0                 tibble_3.2.1               
#>  [9] tidyverse_2.0.0             scran_1.28.1               
#> [11] scuttle_1.10.1              DuoClustering2018_1.18.0   
#> [13] ggplot2_3.4.2               SingleCellExperiment_1.22.0
#> [15] SummarizedExperiment_1.30.2 Biobase_2.60.0             
#> [17] GenomicRanges_1.52.0        GenomeInfoDb_1.36.1        
#> [19] IRanges_2.34.1              S4Vectors_0.38.1           
#> [21] BiocGenerics_0.46.0         MatrixGenerics_1.12.2      
#> [23] matrixStats_1.0.0           scDesign3_0.99.6           
#> [25] BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] jsonlite_1.8.7                umap_0.2.10.0                
#>   [3] magrittr_2.0.3                farver_2.1.1                 
#>   [5] rmarkdown_2.23                fs_1.6.3                     
#>   [7] zlibbioc_1.46.0               ragg_1.2.5                   
#>   [9] vctrs_0.6.3                   memoise_2.0.1                
#>  [11] DelayedMatrixStats_1.22.1     RCurl_1.98-1.12              
#>  [13] askpass_1.1                   htmltools_0.5.5              
#>  [15] S4Arrays_1.0.4                AnnotationHub_3.8.0          
#>  [17] curl_5.0.1                    BiocNeighbors_1.18.0         
#>  [19] sass_0.4.7                    bslib_0.5.0                  
#>  [21] desc_1.4.2                    plyr_1.8.8                   
#>  [23] cachem_1.0.8                  igraph_1.5.0.1               
#>  [25] mime_0.12                     lifecycle_1.0.3              
#>  [27] pkgconfig_2.0.3               rsvd_1.0.5                   
#>  [29] Matrix_1.6-0                  R6_2.5.1                     
#>  [31] fastmap_1.1.1                 GenomeInfoDbData_1.2.10      
#>  [33] shiny_1.7.4.1                 digest_0.6.33                
#>  [35] colorspace_2.1-0              AnnotationDbi_1.62.2         
#>  [37] rprojroot_2.0.3               RSpectra_0.16-1              
#>  [39] dqrng_0.3.0                   irlba_2.3.5.1                
#>  [41] ExperimentHub_2.8.1           textshaping_0.3.6            
#>  [43] RSQLite_2.3.1                 beachmat_2.16.0              
#>  [45] labeling_0.4.2                filelock_1.0.2               
#>  [47] timechange_0.2.0              fansi_1.0.4                  
#>  [49] mgcv_1.9-0                    httr_1.4.6                   
#>  [51] compiler_4.3.1                bit64_4.0.5                  
#>  [53] withr_2.5.0                   BiocParallel_1.34.2          
#>  [55] viridis_0.6.4                 DBI_1.1.3                    
#>  [57] highr_0.10                    MASS_7.3-60                  
#>  [59] openssl_2.1.0                 rappdirs_0.3.3               
#>  [61] DelayedArray_0.26.6           bluster_1.10.0               
#>  [63] tools_4.3.1                   interactiveDisplayBase_1.38.0
#>  [65] httpuv_1.6.11                 glue_1.6.2                   
#>  [67] nlme_3.1-162                  promises_1.2.0.1             
#>  [69] grid_4.3.1                    cluster_2.1.4                
#>  [71] reshape2_1.4.4                generics_0.1.3               
#>  [73] gtable_0.3.3                  tzdb_0.4.0                   
#>  [75] hms_1.1.3                     metapod_1.8.0                
#>  [77] BiocSingular_1.16.0           ScaledMatrix_1.8.1           
#>  [79] utf8_1.2.3                    XVector_0.40.0               
#>  [81] BiocVersion_3.17.1            pillar_1.9.0                 
#>  [83] limma_3.56.2                  later_1.3.1                  
#>  [85] splines_4.3.1                 BiocFileCache_2.8.0          
#>  [87] lattice_0.21-8                survival_3.5-5               
#>  [89] bit_4.0.5                     gamlss.data_6.0-2            
#>  [91] tidyselect_1.2.0              locfit_1.5-9.8               
#>  [93] Biostrings_2.68.1             knitr_1.43                   
#>  [95] gridExtra_2.3                 bookdown_0.34                
#>  [97] edgeR_3.42.4                  xfun_0.39                    
#>  [99] statmod_1.5.0                 stringi_1.7.12               
#> [101] yaml_2.3.7                    evaluate_0.21                
#> [103] codetools_0.2-19              BiocManager_1.30.21.1        
#> [105] cli_3.6.1                     reticulate_1.30              
#> [107] pbmcapply_1.5.1               xtable_1.8-4                 
#> [109] systemfonts_1.0.4             munsell_0.5.0                
#> [111] jquerylib_0.1.4               Rcpp_1.0.11                  
#> [113] dbplyr_2.3.3                  png_0.1-8                    
#> [115] parallel_4.3.1                ellipsis_0.3.2               
#> [117] pkgdown_2.0.7                 blob_1.2.4                   
#> [119] mclust_6.0.0                  sparseMatrixStats_1.12.2     
#> [121] bitops_1.0-7                  gamlss.dist_6.0-5            
#> [123] mvtnorm_1.2-2                 viridisLite_0.4.2            
#> [125] ggthemes_4.2.4                scales_1.2.1                 
#> [127] gamlss_5.4-12                 crayon_1.5.2                 
#> [129] rlang_1.1.1                   KEGGREST_1.40.0