Skip to contents

Introduction

In this tutorial, we will show how to use scDesign3 to simulate data with original batch effects and how to remove the batch effects. We will also demostrate how to add ariticial batch effects.

Read in the reference data

The raw data is from the SeuratData package. The data is called pbmcsca in the package; it is PBMC Systematic Comparative Analysis dataset from the Broad Institute.

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581965")))
print(example_sce)
#> class: SingleCellExperiment 
#> dim: 1000 6276 
#> metadata(0):
#> assays(2): counts logcounts
#> rownames(1000): LYZ GNLY ... PBXIP1 SECTM1
#> rowData names(0):
#> colnames(6276): pbmc2_10X_V2_AAACCTGAGATGGGTC
#>   pbmc2_10X_V2_AAACCTGAGCGTAATA ... pbmc1_10x_v3_TTGAACGCATGCAGCC
#>   pbmc1_10x_v3_TTTGACTAGTGTTCCA
#> colData names(13): phenoid orig.ident ... batch cell_type
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):

To save computational time, we only use the top 100 genes.

example_sce <- example_sce[1:100, ]

The column batch in this example dataset’s colData contains the batch information.

head(colData(example_sce))
#> DataFrame with 6 rows and 13 columns
#>                                        phenoid orig.ident nCount_RNA
#>                                    <character>   <factor>  <numeric>
#> pbmc2_10X_V2_AAACCTGAGATGGGTC           B cell      pbmc2       2360
#> pbmc2_10X_V2_AAACCTGAGCGTAATA           B cell      pbmc2       1888
#> pbmc2_10X_V2_AAACCTGAGCTAGGCA Cytotoxic T cell      pbmc2       3456
#> pbmc2_10X_V2_AAACCTGAGGGTCTCC   Dendritic cell      pbmc2       3802
#> pbmc2_10X_V2_AAACCTGGTCCGAACC      CD4+ T cell      pbmc2       3826
#> pbmc2_10X_V2_AAACCTGTCGTCCGTT      CD4+ T cell      pbmc2       2345
#>                               nFeature_RNA       nGene        nUMI
#>                                  <integer> <character> <character>
#> pbmc2_10X_V2_AAACCTGAGATGGGTC         1044        1044        2360
#> pbmc2_10X_V2_AAACCTGAGCGTAATA          803         803        1888
#> pbmc2_10X_V2_AAACCTGAGCTAGGCA         1372        1372        3456
#> pbmc2_10X_V2_AAACCTGAGGGTCTCC         1519        1519        3802
#> pbmc2_10X_V2_AAACCTGGTCCGAACC         1451        1451        3826
#> pbmc2_10X_V2_AAACCTGTCGTCCGTT          931         931        2345
#>                                     percent.mito     Cluster  Experiment
#>                                      <character> <character> <character>
#> pbmc2_10X_V2_AAACCTGAGATGGGTC 0.0419491525423729           2       pbmc2
#> pbmc2_10X_V2_AAACCTGAGCGTAATA 0.0413135593220339           2       pbmc2
#> pbmc2_10X_V2_AAACCTGAGCTAGGCA 0.0353009259259259           1       pbmc2
#> pbmc2_10X_V2_AAACCTGAGGGTCTCC 0.0420831141504471           6       pbmc2
#> pbmc2_10X_V2_AAACCTGGTCCGAACC 0.0371144798745426           0       pbmc2
#> pbmc2_10X_V2_AAACCTGTCGTCCGTT 0.0652452025586354           0       pbmc2
#>                                          Method    ident             batch
#>                                     <character> <factor>       <character>
#> pbmc2_10X_V2_AAACCTGAGATGGGTC 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#> pbmc2_10X_V2_AAACCTGAGCGTAATA 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#> pbmc2_10X_V2_AAACCTGAGCTAGGCA 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#> pbmc2_10X_V2_AAACCTGAGGGTCTCC 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#> pbmc2_10X_V2_AAACCTGGTCCGAACC 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#> pbmc2_10X_V2_AAACCTGTCGTCCGTT 10x Chromium (v2)    pbmc2 10x Chromium (v2)
#>                                      cell_type
#>                                    <character>
#> pbmc2_10X_V2_AAACCTGAGATGGGTC           B cell
#> pbmc2_10X_V2_AAACCTGAGCGTAATA           B cell
#> pbmc2_10X_V2_AAACCTGAGCTAGGCA Cytotoxic T cell
#> pbmc2_10X_V2_AAACCTGAGGGTCTCC   Dendritic cell
#> pbmc2_10X_V2_AAACCTGGTCCGAACC      CD4+ T cell
#> pbmc2_10X_V2_AAACCTGTCGTCCGTT      CD4+ T cell

Simulation

We can simulate a new data with batch effect information.

set.seed(123)
simu_res <- scdesign3(sce = example_sce, 
                              assay_use = "counts", 
                              celltype = "cell_type", 
                              pseudotime = NULL, 
                              spatial = NULL, 
                              other_covariates = c("batch"), 
                              mu_formula = "cell_type + batch", 
                              sigma_formula = "1", 
                              family_use = "nb", 
                              n_cores = 2, 
                              usebam = FALSE, 
                              corr_formula = "1", 
                              copula = "gaussian", 
                              DT = TRUE, 
                              pseudo_obs = FALSE, 
                              return_model = FALSE)

We can also remove the batch effect and generate new data.

BATCH_data <- construct_data(
  sce = example_sce,
  assay_use = "counts",
  celltype = "cell_type",
  pseudotime = NULL,
  spatial = NULL,
  other_covariates = c("batch"),
  corr_by = "1"
)
BATCH_marginal <- fit_marginal(
  data = BATCH_data,
  predictor = "gene",
  mu_formula = "cell_type + batch",
  sigma_formula = "1",
  family_use = "nb",
  n_cores = 2,
  usebam = FALSE
)
set.seed(123)
BATCH_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = BATCH_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = BATCH_data$dat
  )

In here, we remove the batch effect by setting its coefficient to zero for all genes’ marginal fits. Then, we use the new sets of coefficients to generate the parameters for all genes across all cells.

BATCH_marginal_null <- lapply(BATCH_marginal, function(x) {
  x$fit$coefficients[length(x$fit$coefficients)] <- 0
  x
})
BATCH_para_null <- extract_para(
    sce = example_sce,
    marginal_list = BATCH_marginal_null,
    n_cores = 2,
    family_use = "nb",
    new_covariate =  BATCH_data$newCovariate,
    data = BATCH_data$dat
  )
set.seed(123)
BATCH_newcount_null <- simu_new(
    sce = example_sce,
    mean_mat = BATCH_para_null$mean_mat,
    sigma_mat = BATCH_para_null$sigma_mat,
    zero_mat = BATCH_para_null$zero_mat,
    quantile_mat = NULL,
    copula_list = BATCH_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = BATCH_data$dat,
    new_covariate = BATCH_data$newCovariate,
    important_feature = BATCH_copula$important_feature,
    filtered_gene = BATCH_data$filtered_gene
  )

Additionally, we can alter the batch effect information by mannually change the estimated coefficient for batch effect in each gene’s marginal model. Then, we can simulate new dataset with altered batch effect information.

BATCH_marginal_alter <- lapply(BATCH_marginal, function(x) {
  x$fit$coefficients[length(x$fit$coefficients)] <- rnorm(1, mean = 5, sd = 2)
  x
})
BATCH_para_alter <- extract_para(
    sce = example_sce,
    marginal_list = BATCH_marginal_alter,
    n_cores = 2,
    family_use = "nb",
    new_covariate = BATCH_data$newCovariate,
    data = BATCH_data$dat
  )
set.seed(123)
BATCH_newcount_alter <- simu_new(
    sce = example_sce,
    mean_mat = BATCH_para_alter$mean_mat,
    sigma_mat = BATCH_para_alter$sigma_mat,
    zero_mat = BATCH_para_alter$zero_mat,
    quantile_mat = NULL,
    copula_list = BATCH_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = BATCH_data$dat,
    new_covariate = BATCH_data$newCovariate,
    important_feature = BATCH_copula$important_feature,
    filtered_gene = BATCH_data$filtered_gene
  )

We create three SingleCellExperiment objects using three count matrices generated above and store their logcounts.

simu_res_list <- lapply(list(simu_res$new_count, BATCH_newcount_null, BATCH_newcount_alter), function(x){
  simu_sce <- SingleCellExperiment(list(counts = x), colData = BATCH_data$newCovariate)
  logcounts(simu_sce) <- log1p(counts(simu_sce))
  return(simu_sce)
})

Visulization

set.seed(123)
compare_figure <- plot_reduceddim(ref_sce = example_sce, 
                                  sce_list = simu_res_list, 
                                  name_vec = c("Reference", "w/ Batch", "w/o Batch","Aritifical Batch"),
                                  assay_use = "logcounts", 
                                  if_plot = TRUE, 
                                  color_by = "cell_type", 
                                  shape_by = "batch",
                                  n_pc = 20)
plot(compare_figure$p_umap)

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#>  [3] Biobase_2.60.0              GenomicRanges_1.52.0       
#>  [5] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#>  [7] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#>  [9] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [11] ggplot2_3.4.2               scDesign3_0.99.6           
#> [13] BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        farver_2.1.1            dplyr_1.1.2            
#>  [4] bitops_1.0-7            fastmap_1.1.1           RCurl_1.98-1.12        
#>  [7] digest_0.6.33           lifecycle_1.0.3         survival_3.5-5         
#> [10] gamlss.dist_6.0-5       magrittr_2.0.3          compiler_4.3.1         
#> [13] rlang_1.1.1             sass_0.4.7              tools_4.3.1            
#> [16] utf8_1.2.3              yaml_2.3.7              knitr_1.43             
#> [19] labeling_0.4.2          askpass_1.1             S4Arrays_1.0.4         
#> [22] mclust_6.0.0            reticulate_1.30         DelayedArray_0.26.6    
#> [25] withr_2.5.0             purrr_1.0.1             desc_1.4.2             
#> [28] grid_4.3.1              fansi_1.0.4             colorspace_2.1-0       
#> [31] scales_1.2.1            MASS_7.3-60             cli_3.6.1              
#> [34] mvtnorm_1.2-2           rmarkdown_2.23          crayon_1.5.2           
#> [37] ragg_1.2.5              generics_0.1.3          umap_0.2.10.0          
#> [40] RSpectra_0.16-1         cachem_1.0.8            stringr_1.5.0          
#> [43] zlibbioc_1.46.0         splines_4.3.1           parallel_4.3.1         
#> [46] BiocManager_1.30.21.1   XVector_0.40.0          vctrs_0.6.3            
#> [49] Matrix_1.6-0            jsonlite_1.8.7          bookdown_0.34          
#> [52] gamlss_5.4-12           irlba_2.3.5.1           systemfonts_1.0.4      
#> [55] jquerylib_0.1.4         glue_1.6.2              pkgdown_2.0.7          
#> [58] stringi_1.7.12          gtable_0.3.3            munsell_0.5.0          
#> [61] tibble_3.2.1            pillar_1.9.0            htmltools_0.5.5        
#> [64] openssl_2.1.0           gamlss.data_6.0-2       GenomeInfoDbData_1.2.10
#> [67] R6_2.5.1                textshaping_0.3.6       rprojroot_2.0.3        
#> [70] evaluate_0.21           lattice_0.21-8          highr_0.10             
#> [73] png_0.1-8               memoise_2.0.1           bslib_0.5.0            
#> [76] Rcpp_1.0.11             nlme_3.1-162            mgcv_1.9-0             
#> [79] xfun_0.39               fs_1.6.3                pkgconfig_2.0.3