Skip to contents

Introduction

In this tutorial, we will show how to use scDesign3 to simulate multi-omics (RNA expression + DNA methylation) data by learning from real data that only have a single modality. The example data and aligned low-dimensional embeddings are from Pamona.

Read in the reference data

SCGEMMETH_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581998")))
SCGEMRNA_sce <-readRDS((url("https://figshare.com/ndownloader/files/40582001")))
print(SCGEMMETH_sce)
print(SCGEMRNA_sce)

We first combine the cell-level data from the scRNA-seq data and the DNA methylation data.

coldata_bind <- bind_rows(as_tibble(colData(SCGEMRNA_sce)) %>% dplyr::mutate(Tech = "RNA"), as_tibble(colData(SCGEMMETH_sce)) %>% dplyr::mutate(Tech = "Methylation"))
head(coldata_bind)

Simulation

We first use the step-by-step functions to fit genes’ marginal models, copulas, and extract simulation parameters separately for the scRNA-seq data and the DNA methylation data.

set.seed(123)
RNA_data <- scDesign3::construct_data(
  SCGEMRNA_sce, 
  assay_use = "logcounts", 
  celltype = "cell_type", 
  pseudotime = NULL, 
  spatial = c("UMAP1_integrated", "UMAP2_integrated"), 
  other_covariates = NULL, 
  corr_by = "1"
  )
METH_data <- scDesign3::construct_data(
  SCGEMMETH_sce, 
  assay_use = "counts", 
  celltype = "cell_type", 
  pseudotime = NULL, 
  spatial = c("UMAP1_integrated", "UMAP2_integrated"), 
  other_covariates = NULL, 
  corr_by = "1")

Note here we actually treat the 2D aligned UMAPs as a kind of “pseudo”-spatial data. We use the tensor regression spline to fit two ref datasets seperately.

RNA_marginal <- scDesign3::fit_marginal(
  data = RNA_data, 
  predictor = "gene", 
  mu_formula = "te(UMAP1_integrated, UMAP2_integrated, bs = 'cr', k = 10)", 
  sigma_formula = "te(UMAP1_integrated, UMAP2_integrated, bs = 'cr', k = 5)", 
  family_use = "gaussian", 
  n_cores = 1, 
  usebam = FALSE)

METH_marginal <- scDesign3::fit_marginal(
  data = METH_data, 
  predictor = "gene", 
  mu_formula = "te(UMAP1_integrated, UMAP2_integrated, bs = 'cr', k = 10)", 
  sigma_formula = "1", 
  family_use = "binomial", 
  n_cores = 1, 
  usebam = FALSE)
RNA_copula <- scDesign3::fit_copula(
    sce = SCGEMRNA_sce,
    assay_use = "logcounts",
    marginal_list = RNA_marginal,
    family_use = "gaussian",
    copula = "vine",
    n_cores = 1,
    input_data = RNA_data$dat
  )

METH_copula <- scDesign3::fit_copula(
    sce = SCGEMMETH_sce,
    assay_use = "counts",
    marginal_list = METH_marginal,
    family_use = "binomial",
    copula = "vine",
    n_cores = 1,
    input_data = METH_data$dat
  )
RNA_para <- extract_para(
    sce = SCGEMRNA_sce,
    assay_use = "logcounts",
    marginal_list = RNA_marginal,
    n_cores = 1,
    family_use = "gaussian",
    new_covariate = rbind(RNA_data$dat, METH_data$dat),
    data = RNA_data$dat
  )

METH_para <- extract_para(
    sce = SCGEMMETH_sce,
    marginal_list = METH_marginal,
    n_cores = 1,
    family_use = "binomial",
    new_covariate = rbind(RNA_data$dat, METH_data$dat),
    data = METH_data$dat
  )

Simulate New Datasets

Then, we combined the cell covariates from both the scRNA-seq data and the DNA methylation data as the new covariate to simulate the two new datasets with parameters from scRNA-seq data and the DNA methylation data separately.

RNA_res <- simu_new(
    sce = SCGEMRNA_sce,
    assay_use = "logcounts",
    mean_mat = RNA_para$mean_mat,
    sigma_mat = RNA_para$sigma_mat,
    zero_mat = RNA_para$zero_mat,
    quantile_mat = NULL,
    copula_list = RNA_copula$copula_list,
    n_cores = 1,
    family_use = "gaussian",
    input_data = RNA_data$dat,
    new_covariate = rbind(RNA_data$dat, METH_data$dat),
    important_feature = RNA_copula$important_feature,
    filtered_gene = RNA_data$filtered_gene
  )
METH_res <- simu_new(
    sce = SCGEMMETH_sce,
    mean_mat = METH_para$mean_mat,
    sigma_mat = METH_para$sigma_mat,
    zero_mat = METH_para$zero_mat,
    quantile_mat = NULL,
    copula_list = METH_copula$copula_list,
    n_cores = 1,
    family_use = "binomial",
    input_data = METH_data$dat,
    new_covariate = rbind(RNA_data$dat, METH_data$dat),
    important_feature = METH_copula$important_feature,
    filtered_gene = METH_data$filtered_gene
  )

Visualization

We combine the two synthetic datasets and obtain the UMAP embeddings for the combined dataset.

count_combine <- rbind(RNA_res, METH_res)
count_combine_pca <- irlba::prcomp_irlba(t(count_combine), 5, scale. = TRUE)

count_combine_umap <- umap::umap(count_combine_pca$x, n_neighbors=30, min_dist=0.7)$layout
colnames(count_combine_umap) <- c("UMAP1", "UMAP2")

SCGEMNEW_sce <- SingleCellExperiment::SingleCellExperiment(list(logcounts = count_combine))
reducedDims(SCGEMNEW_sce) <- list(PCA = count_combine_pca$x, UMAP = count_combine_umap)
SCGEMRNA_umap <- umap::umap(colData(SCGEMRNA_sce) %>% as_tibble() %>% dplyr::select(paste0("X", 1:5)), n_neighbors=30, min_dist=0.7)
SCGEMRNA_umap <- SCGEMRNA_umap$layout
colnames(SCGEMRNA_umap) <- c("UMAP1", "UMAP2")
reducedDim(SCGEMRNA_sce, "UMAP") <- SCGEMRNA_umap
SCGEMMETH_umap <- umap::umap(colData(SCGEMMETH_sce) %>% as_tibble() %>% dplyr::select(paste0("X", 1:5)), n_neighbors=30, min_dist=0.7)
SCGEMMETH_umap <- SCGEMMETH_umap$layout
colnames(SCGEMMETH_umap) <- c("UMAP1", "UMAP2")
reducedDim(SCGEMMETH_sce, "UMAP") <- SCGEMMETH_umap
dat_RNA <- SCGEMRNA_umap %>% as_tibble() %>% dplyr::mutate(Method = "Real data: RNA")
dat_METH <- SCGEMMETH_umap %>% as_tibble() %>% dplyr::mutate(Method = "Real data: Methylation")
dat_NEW <- reducedDim(SCGEMNEW_sce, "UMAP") %>% as_tibble() %>% dplyr::mutate(Method = "scDesign3: RNA + Meythlation")
SCGEM_dat <- bind_rows(list(dat_RNA, dat_METH, dat_NEW))

Then, we reformat the UMAP embeddings for the inputted scRNA-seq data, DNA methylation data, and the combined synthetic data and visualize the UMAP embeddings.

design <- matrix(c(2,3,1,3), 2, 2) %>% t()
dat_text_SCGEM <- tibble(Method = c("Real data: RNA", "Real data: Methylation", "scDesign3: RNA + Meythlation"), label = c("32 Features*177 Cells", "27 Features*142 Cells", "59 Features*319 Cells")) %>% as.data.frame()
SCGEM_dat <- SCGEM_dat %>% dplyr::mutate(Method = factor(Method, levels = c("Real data: RNA", "scDesign3: RNA + Meythlation", "Real data: Methylation"))) %>% dplyr::mutate(UMAP1 = if_else(Method == "Real data: RNA", -UMAP1, UMAP1), UMAP2 = if_else(Method == "Real data: RNA", -UMAP2, UMAP2))

p_merge_modals <- ggplot(SCGEM_dat, aes(UMAP1, UMAP2, colour = Method)) + ggrastr::rasterize(geom_point(size = 0.5), dpi = 300) +
  guides(colour = "none") + scale_color_brewer(palette = "Set2") + theme_bw() + theme(aspect.ratio = 1,
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    axis.text.y=element_blank(),
    axis.ticks.y=element_blank()) + facet_manual(~Method, design = design, widths = c(1, 2), heights = c(1, 1), respect = TRUE, scales = "free")+ geom_text(
  data    = dat_text_SCGEM,
  mapping = aes(x = Inf, y = -Inf, label = label), vjust = -6, hjust = 1, color = "black", size = 3)
p_merge_modals

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] umap_0.2.10.0               ggh4x_0.2.5                
#>  [3] ggplot2_3.4.2               dplyr_1.1.2                
#>  [5] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#>  [7] Biobase_2.60.0              GenomicRanges_1.52.0       
#>  [9] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#> [11] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#> [13] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [15] scDesign3_0.99.6            BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        farver_2.1.1            vipor_0.4.5            
#>  [4] bitops_1.0-7            kde1d_1.0.5             fastmap_1.1.1          
#>  [7] RCurl_1.98-1.12         digest_0.6.33           lifecycle_1.0.3        
#> [10] Cairo_1.6-0             survival_3.5-5          gamlss.dist_6.0-5      
#> [13] magrittr_2.0.3          compiler_4.3.1          rlang_1.1.1            
#> [16] sass_0.4.7              tools_4.3.1             utf8_1.2.3             
#> [19] yaml_2.3.7              knitr_1.43              labeling_0.4.2         
#> [22] askpass_1.1             S4Arrays_1.0.4          mclust_6.0.0           
#> [25] reticulate_1.30         DelayedArray_0.26.6     RColorBrewer_1.1-3     
#> [28] withr_2.5.0             purrr_1.0.1             rvinecopulib_0.6.3.1.1 
#> [31] desc_1.4.2              grid_4.3.1              fansi_1.0.4            
#> [34] colorspace_2.1-0        scales_1.2.1            MASS_7.3-60            
#> [37] cli_3.6.1               rmarkdown_2.23          crayon_1.5.2           
#> [40] ragg_1.2.5              generics_0.1.3          RSpectra_0.16-1        
#> [43] ggbeeswarm_0.7.2        cachem_1.0.8            stringr_1.5.0          
#> [46] zlibbioc_1.46.0         splines_4.3.1           assertthat_0.2.1       
#> [49] parallel_4.3.1          ggrastr_1.0.2           BiocManager_1.30.21.1  
#> [52] XVector_0.40.0          vctrs_0.6.3             Matrix_1.6-0           
#> [55] jsonlite_1.8.7          bookdown_0.34           gamlss_5.4-12          
#> [58] beeswarm_0.4.0          irlba_2.3.5.1           systemfonts_1.0.4      
#> [61] jquerylib_0.1.4         glue_1.6.2              pkgdown_2.0.7          
#> [64] rngWELL_0.10-9          stringi_1.7.12          gtable_0.3.3           
#> [67] randtoolbox_2.0.4       munsell_0.5.0           tibble_3.2.1           
#> [70] pillar_1.9.0            htmltools_0.5.5         openssl_2.1.0          
#> [73] gamlss.data_6.0-2       GenomeInfoDbData_1.2.10 R6_2.5.1               
#> [76] textshaping_0.3.6       rprojroot_2.0.3         evaluate_0.21          
#> [79] lattice_0.21-8          highr_0.10              png_0.1-8              
#> [82] memoise_2.0.1           bslib_0.5.0             Rcpp_1.0.11            
#> [85] nlme_3.1-162            mgcv_1.9-0              xfun_0.39              
#> [88] fs_1.6.3                pkgconfig_2.0.3