Simulate single-cell ATAC-seq data

Dongyuan Song

Bioinformatics IDP, University of California, Los Angeles
dongyuansong@ucla.edu

Guanao Yan

Department of Statistics, University of California, Los Angeles
gayan@g.ucla.edu

Qingyang Wang

Department of Statistics, University of California, Los Angeles
qw802@g.ucla.edu

15 September 2023

Source: ../../scDesign3/code/vignettes/scDesign3-scATACseq-vignette.Rmd

scDesign3-scATACseq-vignette.Rmd

library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
theme_set(theme_bw())

Introduction

In this tutorial, we show how to use scDesign3 to simulate the peak by cell matrix of scATAC-seq data.

Read in the reference data

The raw data is from the Signac, which is of human peripheral blood mononuclear cells (PBMCs) provided by 10x Genomics. We pre-select the differentially accessible peaks between clusters.

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581962")))
print(example_sce)
#> class: SingleCellExperiment 
#> dim: 1133 7034 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(1133): chr6-44025105-44028184 chr2-113581628-113594911 ...
#>   chr2-233174300-233175622 chr3-13015237-13015864
#> rowData names(0):
#> colnames(7034): AAACGAAAGAGCGAAA-1 AAACGAAAGAGTTTGA-1 ...
#>   TTTGTGTTCTACTTTG-1 TTTGTGTTCTTGTGCC-1
#> colData names(31): orig.ident nCount_peaks ... ident cell_type
#> reducedDimNames(2): LSI UMAP
#> mainExpName: peaks
#> altExpNames(0):

To save time, we subset 1000 cells.

set.seed(123)
example_sce <- example_sce[, sample(seq_len(dim(example_sce)[2]), 1000, replace = FALSE)]

Simulation

Here we choose the Zero-inflated Poisson (ZIP) as the distribution due to its good empirical performance. Users may explore other distributions (Poisson, NB, ZINB) since there is no conclusion on the best distribution of ATAC-seq.

set.seed(123)
example_simu <- scdesign3(    
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    mu_formula = "cell_type",
    sigma_formula = "1",
    family_use = "zip",
    n_cores = 1,
    usebam = FALSE,
    corr_formula = "cell_type",
    copula = "gaussian",
    DT = TRUE,
    pseudo_obs = FALSE,
    return_model = FALSE,
    nonzerovar = FALSE
  )

We also run the TF-IDF transformation.

tf_idf <- function(Y){
  frequences <- colSums(Y)
  nfreqs <- t(apply(Y, 1, function(x){x/frequences}))
  nfreqs[is.na(nfreqs)] <- 0
  idf <- log(1 + ncol(Y)) - log(rowSums(Y > 0) + 1) + 1
  Y_idf <- apply(nfreqs, 2, function(x){x * idf})
  return(Y_idf)
}
assay(example_sce, "tfidf") <- as.matrix(tf_idf(counts(example_sce)))
simu_sce <- SingleCellExperiment(list(counts = example_simu$new_count), colData = example_simu$new_covariate)
assay(simu_sce, "tfidf") <- as.matrix(tf_idf(counts(simu_sce)))

Visualization

set.seed(123)
compare_figure <- plot_reduceddim(ref_sce = example_sce,
                                  sce_list = list(simu_sce),
                                  name_vec = c("Reference", "scDesign3"),
                                  assay_use = "tfidf",
                                  if_plot = TRUE,
                                  color_by = "cell_type",
                                  n_pc = 20)
plot(compare_figure$p_umap)

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] ggplot2_3.4.2               SingleCellExperiment_1.22.0
#>  [3] SummarizedExperiment_1.30.2 Biobase_2.60.0             
#>  [5] GenomicRanges_1.52.0        GenomeInfoDb_1.36.1        
#>  [7] IRanges_2.34.1              S4Vectors_0.38.1           
#>  [9] BiocGenerics_0.46.0         MatrixGenerics_1.12.2      
#> [11] matrixStats_1.0.0           scDesign3_0.99.6           
#> [13] BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        farver_2.1.1            dplyr_1.1.2            
#>  [4] bitops_1.0-7            fastmap_1.1.1           RCurl_1.98-1.12        
#>  [7] digest_0.6.33           lifecycle_1.0.3         survival_3.5-5         
#> [10] gamlss.dist_6.0-5       magrittr_2.0.3          compiler_4.3.1         
#> [13] rlang_1.1.1             sass_0.4.7              tools_4.3.1            
#> [16] utf8_1.2.3              yaml_2.3.7              knitr_1.43             
#> [19] askpass_1.1             S4Arrays_1.0.4          labeling_0.4.2         
#> [22] mclust_6.0.0            reticulate_1.30         DelayedArray_0.26.6    
#> [25] withr_2.5.0             purrr_1.0.1             desc_1.4.2             
#> [28] grid_4.3.1              fansi_1.0.4             colorspace_2.1-0       
#> [31] scales_1.2.1            MASS_7.3-60             cli_3.6.1              
#> [34] mvtnorm_1.2-2           rmarkdown_2.23          crayon_1.5.2           
#> [37] ragg_1.2.5              generics_0.1.3          umap_0.2.10.0          
#> [40] RSpectra_0.16-1         cachem_1.0.8            stringr_1.5.0          
#> [43] zlibbioc_1.46.0         splines_4.3.1           parallel_4.3.1         
#> [46] BiocManager_1.30.21.1   XVector_0.40.0          vctrs_0.6.3            
#> [49] Matrix_1.6-0            jsonlite_1.8.7          bookdown_0.34          
#> [52] gamlss_5.4-12           irlba_2.3.5.1           systemfonts_1.0.4      
#> [55] jquerylib_0.1.4         glue_1.6.2              pkgdown_2.0.7          
#> [58] stringi_1.7.12          gtable_0.3.3            munsell_0.5.0          
#> [61] tibble_3.2.1            pillar_1.9.0            htmltools_0.5.5        
#> [64] openssl_2.1.0           gamlss.data_6.0-2       GenomeInfoDbData_1.2.10
#> [67] R6_2.5.1                textshaping_0.3.6       rprojroot_2.0.3        
#> [70] evaluate_0.21           lattice_0.21-8          highr_0.10             
#> [73] png_0.1-8               memoise_2.0.1           bslib_0.5.0            
#> [76] Rcpp_1.0.11             nlme_3.1-162            mgcv_1.9-0             
#> [79] xfun_0.39               fs_1.6.3                pkgconfig_2.0.3