Skip to contents

Introduction

In this tutorial, we will show how to use scDesign3 to simulate CITE-seq data and illustrate the similarity between the inputted reference data and synthetic data.

Read in the reference data

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581968")))
print(example_sce)

To save computational time, we only use the top 100 genes and six more genes with protein and RNA abundance information.

keep_gene <- c("CD4",  "CD14", "CD19", "CD34", "CD3E", "CD8A")
keep_adt <- c("ADT_CD4", "ADT_CD14", "ADT_CD19", "ADT_CD34", "ADT_CD3", "ADT_CD8")
keep <- c(keep_gene, keep_adt)
idx <- which(rownames(example_sce) %in% keep)
idx <- c(1:100,idx)
example_sce <- example_sce[idx,]
logcounts(example_sce) <- log1p(counts(example_sce))

Simulation

We input the reference data and use the one-shot scdesign3() function to simulate CITE-seq dat using discrete cell types as the covariates for fitting each gene’s marginal distribution.

set.seed(123)
example_simu <- scdesign3(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    corr_formula = "cell_type",
    copula = "vine",
    DT = TRUE,
    pseudo_obs = FALSE,
    return_model = FALSE,
    nonzerovar = TRUE,
    nonnegative = TRUE
  )

After the simulation, we can create the SinglecellExperiment object using the synthetic count matrix and store the logcounts to the input and synthetic SinglecellExperiment objects.

logcounts(example_sce) <- log1p(counts(example_sce))
simu_sce <- SingleCellExperiment(list(counts = example_simu$new_count), colData = example_simu$new_covariate)
logcounts(simu_sce) <- log1p(counts(simu_sce))

Then, we obtained the PCA and UMAP for both the inputted reference data and the synthetic data. These sets of embedding will be used for the visualization below.

set.seed(123)
 train_pca_fit <- irlba::prcomp_irlba(t(log1p(counts(example_sce))), 
                                          center = TRUE, 
                                          scale. = FALSE, 
                                          n = 50)
reducedDim(simu_sce, "PCA") <- predict(train_pca_fit, newdata= t(log1p(counts(simu_sce))))
simu_pac_fit <- predict(train_pca_fit, newdata= t(logcounts(simu_sce)))
train_umap_fit <- umap::umap(train_pca_fit$x, n_neighbors = 15, min_dist = 0.1)
simu_umap_fit <-  predict(object = train_umap_fit, data= (reducedDim(simu_sce, "PCA")))
colnames(simu_umap_fit ) <- c("UMAP1", "UMAP2")
reducedDim(simu_sce, "UMAP") <- simu_umap_fit 
train_umap <- train_umap_fit$layout
rownames(train_umap) <- colnames(example_sce)
colnames(train_umap) <- c("UMAP1", "UMAP2")

Visualization

To visualize the results, we select six genes and reformat their UMAP embedding we got in the previous step.

expression_train <- as.matrix(logcounts(example_sce))[c(keep_gene ,keep_adt), ] %>% t()  %>% as_tibble() %>% bind_cols(train_umap) %>% dplyr::mutate(Method = "Train data")
expression_scDesign3 <- as.matrix(logcounts(simu_sce))[c(keep_gene ,keep_adt), ] %>% t() %>% as_tibble() %>% bind_cols(simu_umap_fit) %>% dplyr::mutate(Method = "scDesign3")
CITE_dat <- bind_rows(expression_train, expression_scDesign3) %>% as_tibble() %>%
            dplyr::mutate_at(vars(-c(UMAP1, UMAP2, Method)), funs(scales::rescale)) %>% tidyr::pivot_longer(-c("UMAP1", "UMAP2", "Method"), names_to = "Feature", values_to = "Expression") %>% dplyr::mutate(Type = if_else(str_detect(Feature, "ADT"), "Protein", "RNA")) %>% dplyr::mutate(Gene = str_replace(Feature, "ADT_", "")) %>% dplyr::mutate(Gene = if_else(Gene == "CD3E", "CD3", Gene))%>% dplyr::mutate(Gene = if_else(Gene == "CD8A", "CD8", Gene))%>% dplyr::filter(Gene %in% c("CD14", "CD3", "CD8", "CD19")) %>% dplyr::mutate(Gene = factor(Gene, levels = c("CD3", "CD8", "CD14", "CD19"))) %>% dplyr::mutate(Method = factor(Method, levels = c("Train data", "scDesign3")))
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#>  Please use a list of either functions or lambdas:
#> 
#> # Simple named list: list(mean = mean, median = median)
#> 
#> # Auto named with `tibble::lst()`: tibble::lst(mean, median)
#> 
#> # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
head(CITE_dat)
#> # A tibble: 6 × 7
#>    UMAP1 UMAP2 Method     Feature  Expression Type    Gene 
#>    <dbl> <dbl> <fct>      <chr>         <dbl> <chr>   <fct>
#> 1 -0.883  14.5 Train data CD14          0     RNA     CD14 
#> 2 -0.883  14.5 Train data CD19          0     RNA     CD19 
#> 3 -0.883  14.5 Train data CD3E          0     RNA     CD3  
#> 4 -0.883  14.5 Train data CD8A          0     RNA     CD8  
#> 5 -0.883  14.5 Train data ADT_CD14      0.623 Protein CD14 
#> 6 -0.883  14.5 Train data ADT_CD19      0.527 Protein CD19

Six genes’ protein and RNA abundances are shown on the cell UMAP embeddings in the inputted reference data and the synthetic data below.

CITE_dat  %>% ggplot(aes(x = UMAP1, y = UMAP2, color = Expression)) + geom_point(size = 0.1, alpha = 0.5) + scale_colour_gradientn(colors = viridis_pal(option = "A", direction = -1)(10), limits=c(0, 1)) + coord_fixed(ratio = 1) + facet_nested(Method ~ Gene + Type ) + theme(aspect.ratio = 1, legend.position = "bottom")  + theme(aspect.ratio = 1, legend.position = "right") + theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    axis.text.y=element_blank(),
    axis.ticks.y=element_blank())

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] ggh4x_0.2.5                 scales_1.2.1               
#>  [3] tidyr_1.3.0                 stringr_1.5.0              
#>  [5] ggplot2_3.4.2               dplyr_1.1.2                
#>  [7] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#>  [9] Biobase_2.60.0              GenomicRanges_1.52.0       
#> [11] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#> [13] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#> [15] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [17] scDesign3_0.99.6            BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        viridisLite_0.4.2       farver_2.1.1           
#>  [4] bitops_1.0-7            fastmap_1.1.1           RCurl_1.98-1.12        
#>  [7] kde1d_1.0.5             digest_0.6.33           lifecycle_1.0.3        
#> [10] survival_3.5-5          gamlss.dist_6.0-5       magrittr_2.0.3         
#> [13] compiler_4.3.1          rlang_1.1.1             sass_0.4.7             
#> [16] tools_4.3.1             utf8_1.2.3              yaml_2.3.7             
#> [19] knitr_1.43              labeling_0.4.2          askpass_1.1            
#> [22] S4Arrays_1.0.4          mclust_6.0.0            reticulate_1.30        
#> [25] DelayedArray_0.26.6     withr_2.5.0             purrr_1.0.1            
#> [28] rvinecopulib_0.6.3.1.1  desc_1.4.2              grid_4.3.1             
#> [31] fansi_1.0.4             colorspace_2.1-0        MASS_7.3-60            
#> [34] cli_3.6.1               rmarkdown_2.23          crayon_1.5.2           
#> [37] ragg_1.2.5              generics_0.1.3          umap_0.2.10.0          
#> [40] RSpectra_0.16-1         cachem_1.0.8            zlibbioc_1.46.0        
#> [43] splines_4.3.1           assertthat_0.2.1        parallel_4.3.1         
#> [46] BiocManager_1.30.21.1   XVector_0.40.0          vctrs_0.6.3            
#> [49] Matrix_1.6-0            jsonlite_1.8.7          bookdown_0.34          
#> [52] gamlss_5.4-12           irlba_2.3.5.1           systemfonts_1.0.4      
#> [55] jquerylib_0.1.4         glue_1.6.2              pkgdown_2.0.7          
#> [58] rngWELL_0.10-9          stringi_1.7.12          gtable_0.3.3           
#> [61] randtoolbox_2.0.4       munsell_0.5.0           tibble_3.2.1           
#> [64] pillar_1.9.0            htmltools_0.5.5         openssl_2.1.0          
#> [67] gamlss.data_6.0-2       GenomeInfoDbData_1.2.10 R6_2.5.1               
#> [70] textshaping_0.3.6       rprojroot_2.0.3         evaluate_0.21          
#> [73] lattice_0.21-8          highr_0.10              png_0.1-8              
#> [76] memoise_2.0.1           bslib_0.5.0             Rcpp_1.0.11            
#> [79] nlme_3.1-162            mgcv_1.9-0              xfun_0.39              
#> [82] fs_1.6.3                pkgconfig_2.0.3