Skip to contents

In this tutorial, we will demostrate how to simulate a new dataset using a one-shot function or a sequence of functions provided by our package.

Read in the reference data

The raw data is from the scvelo, which describes pancreatic endocrinogenesis. We pre-select the top 1000 highly variable genes and filter out some cell types to ensure a single trajectory.

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581992")))
print(example_sce)
#> class: SingleCellExperiment 
#> dim: 1000 2087 
#> metadata(5): clusters_coarse_colors clusters_colors day_colors
#>   neighbors pca
#> assays(6): X spliced ... cpm logcounts
#> rownames(1000): Pyy Iapp ... Eya2 Kif21a
#> rowData names(1): highly_variable_genes
#> colnames(2087): AAACCTGAGAGGGATA AAACCTGGTAAGTGGC ... TTTGTCAAGTGACATA
#>   TTTGTCAAGTGTGGCA
#> colData names(7): clusters_coarse clusters ... sizeFactor pseudotime
#> reducedDimNames(4): X_pca X_umap PCA UMAP
#> mainExpName: NULL
#> altExpNames(0):

To save computational time, we only use the top 100 genes.

example_sce <- example_sce[1:100, ]

Simulation

The function scdesign3() is a one-shot function that can generate new dataset.

set.seed(123)
example_simu <- scdesign3(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = "pseudotime",
    spatial = NULL,
    other_covariates = NULL,
    mu_formula = "s(pseudotime, k = 10, bs = 'cr')",
    sigma_formula = "s(pseudotime, k = 5, bs = 'cr')",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    corr_formula = "1",
    copula = "gaussian",
    DT = TRUE,
    pseudo_obs = FALSE,
    return_model = FALSE,
    nonzerovar = FALSE
  )

We create the SingleCellExperiment objects using the new count matrix generated above and store the logcounts.

logcounts(example_sce) <- log1p(counts(example_sce))
simu_sce <- SingleCellExperiment(list(counts = example_simu$new_count), colData = example_simu$new_covariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))

Visualization

set.seed(123)
compare_figure <- plot_reduceddim(ref_sce = example_sce, 
                                  sce_list = list(simu_sce), 
                                  name_vec = c("Reference", "scDesign3"),
                                  assay_use = "logcounts", 
                                  if_plot = TRUE, 
                                  color_by = "pseudotime", 
                                  n_pc = 20)
plot(compare_figure$p_umap)

Step-by-step functions

Alternatively, you can run through the following steps to generate the new dataset. The code below does exactly the same thing as the one-shot function above.

  1. Construct the input dataset.
PANCREAS_data <- construct_data(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = "pseudotime",
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
  )
  1. Fit regression models for each feature based on your specification.
PANCREAS_marginal <- fit_marginal(
    data = PANCREAS_data,
    predictor = "gene",
    mu_formula = "s(pseudotime, k = 10, bs = 'cr')",
    sigma_formula = "s(pseudotime, k = 5, bs = 'cr')",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE
  )
  1. Fit a copula, obtain AIC and BIC.
set.seed(123)
PANCREAS_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = PANCREAS_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = PANCREAS_data$dat,
    important_feature = "auto"
  )
  1. Extract out the estimated parameters so you can make some modifications and use the modified parameters to generate new data if needed. It can extract out the following parameters:
  • a cell-by-gene mean matrix
  • a sigma matrix which is:
    • a cell-by-gene matrix of \(\frac{1}{\phi}\) for negative binomial distribution
    • a cell-by-gene matrix of the standard deviation \(\sigma\) for Gaussian distribution
    • a cell-by-gene matrix of 1 for poisson distribution
  • a zero matrix which is:
    • a cell-by-gene matrix of zero probabilities for zero-inflated negative binomial and zero-inflated poisson distributions
    • a zero matrix for negative binomial, Gaussian, and poisson distributions
PANCREAS_para <- extract_para(
    sce = example_sce,
    marginal_list = PANCREAS_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = NULL,
    data = PANCREAS_data$dat
  )
  1. Simulate the new count matrix.
set.seed(123)
 PANCREAS_newcount <- simu_new(
    sce = example_sce,
    mean_mat = PANCREAS_para$mean_mat,
    sigma_mat = PANCREAS_para$sigma_mat,
    zero_mat = PANCREAS_para$zero_mat,
    quantile_mat = NULL,
    copula_list = PANCREAS_copula$copula_list,
    n_cores = 1,
    family_use = "nb",
    input_data = PANCREAS_data$dat,
    new_covariate = PANCREAS_data$newCovariate,
    important_feature = PANCREAS_copula$important_feature,
    filtered_gene = PANCREAS_data$filtered_gene
  )

Then, we can create the SinglecellExperiment object using the synthetic count matrix and store the logcounts to the input and synthetic SinglecellExperiment objects.

simu_sce <- SingleCellExperiment(list(counts = PANCREAS_newcount), colData = PANCREAS_data$newCovariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))

Visualization

set.seed(123)
compare_figure <- plot_reduceddim(ref_sce = example_sce, 
                                  sce_list = list(simu_sce), 
                                  name_vec = c("Reference", "scDesign3"),
                                  assay_use = "logcounts", 
                                  if_plot = TRUE, 
                                  color_by = "pseudotime", 
                                  n_pc = 20)
plot(compare_figure$p_umap)

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#>  [3] Biobase_2.60.0              GenomicRanges_1.52.0       
#>  [5] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#>  [7] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#>  [9] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [11] ggplot2_3.4.2               scDesign3_0.99.6           
#> [13] BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        viridisLite_0.4.2       farver_2.1.1           
#>  [4] dplyr_1.1.2             viridis_0.6.4           bitops_1.0-7           
#>  [7] fastmap_1.1.1           RCurl_1.98-1.12         digest_0.6.33          
#> [10] lifecycle_1.0.3         survival_3.5-5          gamlss.dist_6.0-5      
#> [13] magrittr_2.0.3          compiler_4.3.1          rlang_1.1.1            
#> [16] sass_0.4.7              tools_4.3.1             utf8_1.2.3             
#> [19] yaml_2.3.7              knitr_1.43              labeling_0.4.2         
#> [22] askpass_1.1             S4Arrays_1.0.4          mclust_6.0.0           
#> [25] reticulate_1.30         DelayedArray_0.26.6     withr_2.5.0            
#> [28] purrr_1.0.1             desc_1.4.2              grid_4.3.1             
#> [31] fansi_1.0.4             colorspace_2.1-0        scales_1.2.1           
#> [34] MASS_7.3-60             cli_3.6.1               mvtnorm_1.2-2          
#> [37] rmarkdown_2.23          crayon_1.5.2            ragg_1.2.5             
#> [40] generics_0.1.3          umap_0.2.10.0           RSpectra_0.16-1        
#> [43] cachem_1.0.8            stringr_1.5.0           zlibbioc_1.46.0        
#> [46] splines_4.3.1           parallel_4.3.1          BiocManager_1.30.21.1  
#> [49] XVector_0.40.0          vctrs_0.6.3             Matrix_1.6-0           
#> [52] jsonlite_1.8.7          bookdown_0.34           gamlss_5.4-12          
#> [55] irlba_2.3.5.1           systemfonts_1.0.4       jquerylib_0.1.4        
#> [58] glue_1.6.2              pkgdown_2.0.7           stringi_1.7.12         
#> [61] gtable_0.3.3            munsell_0.5.0           tibble_3.2.1           
#> [64] pillar_1.9.0            htmltools_0.5.5         openssl_2.1.0          
#> [67] gamlss.data_6.0-2       GenomeInfoDbData_1.2.10 R6_2.5.1               
#> [70] textshaping_0.3.6       rprojroot_2.0.3         evaluate_0.21          
#> [73] lattice_0.21-8          highr_0.10              png_0.1-8              
#> [76] memoise_2.0.1           bslib_0.5.0             Rcpp_1.0.11            
#> [79] gridExtra_2.3           nlme_3.1-162            mgcv_1.9-0             
#> [82] xfun_0.39               fs_1.6.3                pkgconfig_2.0.3