前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >表达矩阵处理—表达QC(reads)

表达矩阵处理—表达QC(reads)

作者头像
生信技能树jimmy
发布2020-03-31 12:30:37
1K0
发布2020-03-31 12:30:37
举报
文章被收录于专栏:单细胞天地
7. 清理表达矩阵

7.2

表达QC(reads)

代码语言:javascript
复制
library(SingleCellExperiment)
library(scater)
options(stringsAsFactors = FALSE)
代码语言:javascript
复制
reads <- read.table("tung/reads.txt", sep = "\t")
anno <- read.table("tung/annotation.txt", sep = "\t", header = TRUE)
代码语言:javascript
复制
head(reads[ , 1:3])
代码语言:javascript
复制
##                 NA19098.r1.A01 NA19098.r1.A02 NA19098.r1.A03
## ENSG00000237683              0              0              0
## ENSG00000187634              0              0              0
## ENSG00000188976             57            140              1
## ENSG00000187961              0              0              0
## ENSG00000187583              0              0              0
## ENSG00000187642              0              0              0
代码语言:javascript
复制
head(anno)
代码语言:javascript
复制
##   individual replicate well      batch      sample_id
## 1    NA19098        r1  A01 NA19098.r1 NA19098.r1.A01
## 2    NA19098        r1  A02 NA19098.r1 NA19098.r1.A02
## 3    NA19098        r1  A03 NA19098.r1 NA19098.r1.A03
## 4    NA19098        r1  A04 NA19098.r1 NA19098.r1.A04
## 5    NA19098        r1  A05 NA19098.r1 NA19098.r1.A05
## 6    NA19098        r1  A06 NA19098.r1 NA19098.r1.A06
代码语言:javascript
复制
reads <- SingleCellExperiment(
    assays = list(counts = as.matrix(reads)),
    colData = anno
)
代码语言:javascript
复制
keep_feature <- rowSums(counts(reads) > 0) > 0
reads <- reads[keep_feature, ]
代码语言:javascript
复制
isSpike(reads, "ERCC") <- grepl("^ERCC-", rownames(reads))
isSpike(reads, "MT") <- rownames(reads) %in%
    c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888",
    "ENSG00000198886", "ENSG00000212907", "ENSG00000198786",
    "ENSG00000198695", "ENSG00000198712", "ENSG00000198804",
    "ENSG00000198763", "ENSG00000228253", "ENSG00000198938",
    "ENSG00000198840")
代码语言:javascript
复制
reads <- calculateQCMetrics(
    reads,
    feature_controls = list(
        ERCC = isSpike(reads, "ERCC"),
        MT = isSpike(reads, "MT")
    )
)
代码语言:javascript
复制
hist(
    reads$total_counts,
    breaks = 100
)
abline(v = 1.3e6, col = "red")
代码语言:javascript
复制
filter_by_total_counts <- (reads$total_counts > 1.3e6)
代码语言:javascript
复制
table(filter_by_total_counts)
代码语言:javascript
复制
## filter_by_total_counts
## FALSE  TRUE
##   180   684
代码语言:javascript
复制
hist(
    reads$total_features,
    breaks = 100
)
abline(v = 7000, col = "red")
代码语言:javascript
复制
filter_by_expr_features <- (reads$total_features > 7000)
代码语言:javascript
复制
table(filter_by_expr_features)
代码语言:javascript
复制
## filter_by_expr_features
## FALSE  TRUE
##   116   748
代码语言:javascript
复制
plotPhenoData(
    reads,
    aes_string(
        x = "total_features",
        y = "pct_counts_MT",
        colour = "batch"
    )
)
代码语言:javascript
复制
plotPhenoData(
    reads,
    aes_string(
        x = "total_features",
        y = "pct_counts_ERCC",
        colour = "batch"
    )
)
代码语言:javascript
复制
filter_by_ERCC <-
    reads$batch != "NA19098.r2" & reads$pct_counts_ERCC < 25
table(filter_by_ERCC)
代码语言:javascript
复制
## filter_by_ERCC
## FALSE  TRUE
##   103   761
代码语言:javascript
复制
filter_by_MT <- reads$pct_counts_MT < 30
table(filter_by_MT)
代码语言:javascript
复制
## filter_by_MT
## FALSE  TRUE
##    18   846
代码语言:javascript
复制
reads$use <- (
    # sufficient features (genes)
    filter_by_expr_features &
    # sufficient molecules counted
    filter_by_total_counts &
    # sufficient endogenous RNA
    filter_by_ERCC &
    # remove cells with unusual number of reads in MT genes
    filter_by_MT
)
代码语言:javascript
复制
table(reads$use)
代码语言:javascript
复制
##
## FALSE  TRUE
##   258   606
代码语言:javascript
复制
reads <- plotPCA(
    reads,
    size_by = "total_features",
    shape_by = "use",
    pca_data_input = "pdata",
    detect_outliers = TRUE,
    return_SCE = TRUE
)
代码语言:javascript
复制
table(reads$outlier)
代码语言:javascript
复制
##
## FALSE  TRUE
##   756   108
代码语言:javascript
复制
library(limma)
代码语言:javascript
复制
##
## Attaching package: 'limma'
代码语言:javascript
复制
## The following object is masked from 'package:scater':
##
##     plotMDS
代码语言:javascript
复制
## The following object is masked from 'package:BiocGenerics':
##
##     plotMA
代码语言:javascript
复制
auto <- colnames(reads)[reads$outlier]
man <- colnames(reads)[!reads$use]
venn.diag <- vennCounts(
    cbind(colnames(reads) %in% auto,
    colnames(reads) %in% man)
)
vennDiagram(
    venn.diag,
    names = c("Automatic", "Manual"),
    circle.col = c("blue", "green")
)
代码语言:javascript
复制
plotQC(reads, type = "highest-expression")
代码语言:javascript
复制
filter_genes <- apply(
    counts(reads[, colData(reads)$use]),
    1,
    function(x) length(x[x > 1]) >= 2
)
rowData(reads)$use <- filter_genes
代码语言:javascript
复制
table(filter_genes)
代码语言:javascript
复制
## filter_genes
## FALSE  TRUE
##  2664 16062
代码语言:javascript
复制
dim(reads[rowData(reads)$use, colData(reads)$use])
代码语言:javascript
复制
## [1] 16062   606
代码语言:javascript
复制
assay(reads, "logcounts_raw") <- log2(counts(reads) + 1)
reducedDim(reads) <- NULL
代码语言:javascript
复制
saveRDS(reads, file = "tung/reads.rds")

通过比较图7.6和图7.13,很明显基于read的过滤比基于UMI的分析去除了更多的细胞。如果您返回并比较结果,您应该能够得出结论,ERCC和MT过滤器对于基于read的分析更严格。

代码语言:javascript
复制
sessionInfo()
代码语言:javascript
复制
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.19.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel  stats4    methods   stats     graphics  grDevices utils
## [8] datasets  base
##
## other attached packages:
##  [1] limma_3.34.9               scater_1.6.3
##  [3] ggplot2_2.2.1              SingleCellExperiment_1.0.0
##  [5] SummarizedExperiment_1.8.1 DelayedArray_0.4.1
##  [7] matrixStats_0.53.1         Biobase_2.38.0
##  [9] GenomicRanges_1.30.3       GenomeInfoDb_1.14.0
## [11] IRanges_2.12.0             S4Vectors_0.16.0
## [13] BiocGenerics_0.24.0        knitr_1.20
##
## loaded via a namespace (and not attached):
##   [1] backports_1.1.2        plyr_1.8.4             lazyeval_0.2.1
##   [4] sp_1.2-7               shinydashboard_0.6.1   splines_3.4.3
##   [7] digest_0.6.15          htmltools_0.3.6        viridis_0.5.0
##  [10] magrittr_1.5           memoise_1.1.0          cluster_2.0.6
##  [13] prettyunits_1.0.2      colorspace_1.3-2       blob_1.1.0
##  [16] rrcov_1.4-3            xfun_0.1               dplyr_0.7.4
##  [19] RCurl_1.95-4.10        tximport_1.6.0         lme4_1.1-15
##  [22] bindr_0.1              zoo_1.8-1              glue_1.2.0
##  [25] gtable_0.2.0           zlibbioc_1.24.0        XVector_0.18.0
##  [28] MatrixModels_0.4-1     car_2.1-6              kernlab_0.9-25
##  [31] prabclus_2.2-6         DEoptimR_1.0-8         SparseM_1.77
##  [34] VIM_4.7.0              scales_0.5.0           sgeostat_1.0-27
##  [37] mvtnorm_1.0-7          DBI_0.7                GGally_1.3.2
##  [40] edgeR_3.20.9           Rcpp_0.12.15           sROC_0.1-2
##  [43] viridisLite_0.3.0      xtable_1.8-2           progress_1.1.2
##  [46] laeken_0.4.6           bit_1.1-12             mclust_5.4
##  [49] vcd_1.4-4              httr_1.3.1             RColorBrewer_1.1-2
##  [52] fpc_2.1-11             modeltools_0.2-21      pkgconfig_2.0.1
##  [55] reshape_0.8.7          XML_3.98-1.10          flexmix_2.3-14
##  [58] nnet_7.3-12            locfit_1.5-9.1         labeling_0.3
##  [61] rlang_0.2.0            reshape2_1.4.3         AnnotationDbi_1.40.0
##  [64] munsell_0.4.3          tools_3.4.3            RSQLite_2.0
##  [67] pls_2.6-0              evaluate_0.10.1        stringr_1.3.0
##  [70] cvTools_0.3.2          yaml_2.1.17            bit64_0.9-7
##  [73] robustbase_0.92-8      bindrcpp_0.2           nlme_3.1-129
##  [76] mime_0.5               quantreg_5.35          biomaRt_2.34.2
##  [79] compiler_3.4.3         pbkrtest_0.4-7         beeswarm_0.2.3
##  [82] e1071_1.6-8            tibble_1.4.2           robCompositions_2.0.6
##  [85] pcaPP_1.9-73           stringi_1.1.6          highr_0.6
##  [88] lattice_0.20-34        trimcluster_0.1-2      Matrix_1.2-7.1
##  [91] nloptr_1.0.4           pillar_1.2.1           lmtest_0.9-35
##  [94] data.table_1.10.4-3    cowplot_0.9.2          bitops_1.0-6
##  [97] httpuv_1.3.6.1         R6_2.2.2               bookdown_0.7
## [100] gridExtra_2.3          vipor_0.4.5            boot_1.3-18
## [103] MASS_7.3-45            assertthat_0.2.0       rhdf5_2.22.0
## [106] rprojroot_1.3-2        rjson_0.2.15           GenomeInfoDbData_1.0.0
## [109] diptest_0.75-7         mgcv_1.8-23            grid_3.4.3
## [112] class_7.3-14           minqa_1.2.4            rmarkdown_1.8
## [115] mvoutlier_2.0.9        shiny_1.0.5            ggbeeswarm_0.6.0
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2019-06-27,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 单细胞天地 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档