我们的新专辑《Github带有全套代码分享的文献复现2025》开启后受到大家的热烈喜爱,里面学习的文章为:《A multi-omic single-cell landscape of human gynecologic malignancies》,前面的学习笔记如下:
今天继续来学习他的代码wiki上的:https://github.com/RegnerM2015/scENDO_scOVAR_2020/wiki
前面学习完了数据预处理QC,DoubletFinder双细胞鉴定以及 DoubletDecon 双细胞鉴定。
作者对每个样本做完了QC,双细胞过滤后,进行了数据合并,并标准化、特征选择以及聚类:
作者前面的数据预处理每个样本都有一个脚本,并且很复杂,但是我们已经学完了里面的思路,在后面的步骤中,我们选择一个比较简单的方法进行预处理与合并然后接上作者后面的代码。
数据前面预处理的思路可以看曾老板写的两个超棒的帖子:
https://ftp.ncbi.nlm.nih.gov/geo/series/GSE173nnn/GSE173682/suppl/GSE173682_RAW.tar
GSE173682/scRNA-Seq/
├── GSM5276933
│ ├── barcodes.tsv.gz
│ ├── features.tsv.gz
│ └── matrix.mtx.gz
├── GSM5276934
│ ├── barcodes.tsv.gz
│ ├── features.tsv.gz
│ └── matrix.mtx.gz
├── GSM5276935
│ ├── barcodes.tsv.gz
│ ├── features.tsv.gz
│ └── matrix.mtx.gz
├── GSM5276936
│ ├── barcodes.tsv.gz
│ ├── features.tsv.gz
│ └── matrix.mtx.gz
├── GSM5276937
│ ├── barcodes.tsv.gz
│ ├── features.tsv.gz
│ └── matrix.mtx.gz
................
共11个样本,得到 86729 个细胞。
###
### Create: Jianming Zeng
### Date: 2023-12-31
### Email: jmzeng1314@163.com
### Blog: http://www.bio-info-trainee.com/
### Forum: http://www.biotrainee.com/thread-1376-1-1.html
### CAFS/SUSTC/Eli Lilly/University of Macau
### Update Log: 2023-12-31 First version
### Update Log: 2024-12-09 by juan zhang (492482942@qq.com)
###
rm(list=ls())
options(stringsAsFactors = F)
library(ggsci)
library(dplyr)
library(future)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(ggplot2)
library(patchwork)
library(stringr)
library(qs)
library(Matrix)
getwd()
###### step1: 导入数据 ######
samples <- list.dirs("GSE173682/scRNA-Seq/", recursive = F, full.names = F)
samples
scRNAlist <- lapply(samples, function(pro){
#pro <- samples[1]
print(pro)
folder <- file.path("GSE173682/scRNA-Seq/", pro)
folder
counts <- Read10X(folder, gene.column = 2)
sce <- CreateSeuratObject(counts, project=pro)
return(sce)
})
names(scRNAlist) <- samples
scRNAlist
# merge
sce.all <- merge(scRNAlist[[1]], y=scRNAlist[-1], add.cell.ids=samples)
sce.all <- JoinLayers(sce.all) # seurat v5
sce.all
# 查看特征
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all$orig.ident)
每个样本对应的信息在 matrix文件中:
# 添加样本表型信息
library(GEOquery)
gse <- "GSE173682"
gset <- getGEO(gse, destdir = './GSE173682/', getGPL = F)
a <- gset[[1]]
## 挑选一些感兴趣的临床表型。
pd <- pData(a)
colnames(pd)
# Patient_1_3533EL_RNA GSM5276933
pd_scrna <- pd[grepl("_RNA",pd$title),c("title","geo_accession","characteristics_ch1.2")]
pd_scrna$characteristics_ch1.2 <- gsub("tumor_site: ", "", pd_scrna$characteristics_ch1.2)
colnames(pd_scrna)[3] <- "tumor_site"
pd_scrna
library(tidyverse)
meta <- sce.all@meta.data %>% rownames_to_column("ID")
meta_new <- merge(meta, pd_scrna, by.x="orig.ident", by.y="geo_accession")
rownames(meta_new) <- meta_new$ID
head(meta_new)
table(meta_new$tumor_site, meta_new$orig.ident)
meta_new <- meta_new[,-2]
sce.all <- AddMetaData(sce.all, metadata = meta_new)
head(sce.all@meta.data)
使用qs保存速度快且文件小:
# 保存
library(qs)
qsave(sce.all, file="GSE173682/scRNA-Seq/sce.all.qs")
这里采用的策略比较简单,对所有样本采用一个统一的阈值进行过滤,过滤前应该看一下每个指标的原始数据分布情况:
rm(list=ls())
library(Seurat)
library(qs)
# 读取数据
sce.all <- qread("GSE173682/scRNA-Seq/sce.all.qs")
sce.all
###### step2: QC质控 ######
## 如果过滤的太狠,就需要去修改这个过滤代码
dir.create("./1-QC")
# 1.计算线粒体基因比例
mito_genes <- grep("^MT-", rownames(sce.all),ignore.case = T, value = T)
# 可能是13个线粒体基因
print(mito_genes)
# sce.all=PercentageFeatureSet(sce.all, "^MT-", col.name = "percent_mito")
sce.all <- PercentageFeatureSet(sce.all, features = mito_genes, col.name = "percent_mito")
fivenum(sce.all@meta.data$percent_mito)
# 2.计算核糖体基因比例
ribo_genes <- grep("^Rp[sl]", rownames(sce.all),ignore.case = T, value = T)
print(ribo_genes)
sce.all <- PercentageFeatureSet(sce.all, features = ribo_genes, col.name = "percent_ribo")
fivenum(sce.all@meta.data$percent_ribo)
# 3.计算红血细胞基因比例
Hb_genes <- grep("^Hb[^(p)]", rownames(sce.all),ignore.case = T,value = T)
print(Hb_genes)
sce.all <- PercentageFeatureSet(sce.all, features=Hb_genes, col.name="percent_hb")
fivenum(sce.all@meta.data$percent_hb)
# 可视化细胞的上述比例情况
# pic1
p1 <- VlnPlot(sce.all, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA","percent_mito", "percent_ribo", "percent_hb"),
pt.size = 0.02, ncol = 5) + NoLegend()
p1
w <- length(unique(sce.all$orig.ident))/1.5+10;w
ggsave(filename="1-QC/Vlnplot1.pdf",plot=p1,width = w,height = 5)
# pic2
p2 <- VlnPlot(sce.all, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA","percent_mito", "percent_ribo", "percent_hb"),
pt.size = 0, ncol = 5) + NoLegend()
p2
w <- length(unique(sce.all$orig.ident))/1.5+10;w
ggsave(filename="1-QC/Vlnplot2.pdf",plot=p2,width = w,height = 5)
# pic3
p3 <- FeatureScatter(sce.all, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5) +
NoLegend()
ggsave(filename="1-QC/Scatterplot.pdf",plot=p3, width = 6, height = 6)
有点的和没点的都看下:
过滤:红细胞含量比较少一般在前面制作细胞悬液的时候会有一个裂红操作就是去掉红细胞,除非你很明确要分析红细胞。线粒体含量比较高,这里是子宫和卵巢组织,这个组织比较复杂,我这里选择比较温和一点的指标:nFeature_rna < 7000, nCount_RNA < 50000, percent_mito < 25%, percent_hb < 1%
## 根据上述指标,过滤低质量细胞/基因
# 过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
# 一般来说,在CreateSeuratObject的时候已经是进行了这个过滤操作
# 如果后期看到了自己的单细胞降维聚类分群结果很诡异,就可以回过头来看质量控制环节
# 先走默认流程即可
dim(sce.all)
# 过滤细胞:细胞中基因表达数少于多少基因 以及 大于多少基因
summary(sce.all$nFeature_RNA)
sce.all.filt <- subset(sce.all, subset = nFeature_RNA > 200 & nFeature_RNA < 7000)
dim(sce.all.filt)
# 过滤细胞:细胞中基因表达count数少于多少 以及 大于多少
summary(sce.all$nCount_RNA)
sce.all.filt <- subset(sce.all.filt, subset = nCount_RNA > 3 & nCount_RNA < 50000)
dim(sce.all.filt)
# 过滤细胞指标2:线粒体/核糖体基因比例/红细胞(根据上面的violin图)
sce.all.filt <- subset(sce.all.filt, subset = percent_mito < 25)
dim(sce.all.filt)
# sce.all.filt <- subset(sce.all.filt, subset = percent_ribo > 3)
# dim(sce.all.filt)
sce.all.filt <- subset(sce.all.filt, subset = percent_hb < 1)
dim(sce.all.filt)
# 过滤后每个样本中的细胞数
stat_raw <- as.data.frame(table(sce.all$orig.ident))
colnames(stat_raw) <- c("sampleid","cellnum_raw")
stat_filter <- as.data.frame(table(sce.all.filt$orig.ident))
colnames(stat_filter) <- c("sampleid","cellnum_filter")
stat <- merge(stat_raw, stat_filter, by="sampleid")
stat$filtered <- stat$cellnum_raw - stat$cellnum_filter
head(stat)
write.table(stat, "1-QC/stat.xls",row.names = F,sep = "\t",quote = F)
# 可视化过滤后的情况
p1_filtered <- VlnPlot(sce.all.filt, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA","percent_mito", "percent_ribo", "percent_hb"),
pt.size = 0.02, ncol = 5) + NoLegend()
p1
w <- length(unique(sce.all.filt$orig.ident))/1.5+10;w
ggsave(filename="1-QC/Vlnplot1_filtered.pdf",plot=p1_filtered,width = w,height = 5)
# pic2
p2_filtered <- VlnPlot(sce.all.filt, group.by = "orig.ident", features = c("nFeature_RNA", "nCount_RNA","percent_mito", "percent_ribo", "percent_hb"),
pt.size = 0, ncol = 5) + NoLegend()
p2_filtered
w <- length(unique(sce.all.filt$orig.ident))/1.5+10;w
ggsave(filename="1-QC/Vlnplot2_filtered.pdf",plot=p2_filtered,width = w,height = 5)
# 保存
library(qs)
qsave(sce.all.filt, file="1-QC/sce.all.filt.qs")
过滤前后数据统计:
sampleid | cellnum_raw | cellnum_filter | filtered | |
---|---|---|---|---|
9 | GSM5276941 | 8984 | 8732 | 252 |
4 | GSM5276936 | 8110 | 7822 | 288 |
1 | GSM5276933 | 5697 | 5370 | 327 |
10 | GSM5276942 | 10094 | 9679 | 415 |
5 | GSM5276937 | 8403 | 7944 | 459 |
3 | GSM5276935 | 6054 | 5487 | 567 |
2 | GSM5276934 | 7963 | 7367 | 596 |
6 | GSM5276938 | 8009 | 7411 | 598 |
11 | GSM5276943 | 6939 | 6320 | 619 |
8 | GSM5276940 | 8181 | 7366 | 815 |
7 | GSM5276939 | 8295 | 5943 | 2352 |
文章中的描述基本上都是默认函数以及参数,标准步骤如下,看一下去批次前的结果:
rm(list=ls())
library(harmony)
library(qs)
library(Seurat)
library(clustree)
###### step3: harmony整合多个单细胞样品 ######
dir.create("2-harmony")
getwd()
# 读取数据
sce.all.filt <- qread("1-QC/sce.all.filt.qs")
# 默认 ScaleData 没有添加"nCount_RNA", "nFeature_RNA"
sce.all.filt
print(dim(sce.all.filt))
head(sce.all.filt@meta.data)
# 走标准流程
sce.all.filt <- NormalizeData(sce.all.filt, normalization.method = "LogNormalize",scale.factor = 1e4)
sce.all.filt <- FindVariableFeatures(sce.all.filt, nfeatures = 2000)
sce.all.filt <- ScaleData(sce.all.filt)
sce.all.filt <- RunPCA(sce.all.filt, features = VariableFeatures(object = sce.all.filt))
# 降维聚类,UMAP降维
sce.all.filt <- FindNeighbors(sce.all.filt, dims = 1:20)
sce.all.filt <- FindClusters(sce.all.filt, resolution = 0.5)
head(Idents(sce.all.filt), 5)
str(sce.all.filt@meta.data)
head(sce.all.filt$RNA_snn_res.0.5)
head(sce.all.filt$seurat_clusters)
sce.all.filt <- RunUMAP(sce.all.filt, dims = 1:20, reduction = "pca")
p <- DimPlot(sce.all.filt, reduction = "umap",label=T,group.by = "orig.ident")
p
ggsave(filename='2-harmony/umap-by-orig.ident-before-harmony.png',plot = p, width = 6,height = 5.5)
可以看到现在的类别基本上是以样本为单位聚在一起,但是也有一些群有不同的样本混合在一起。一般来说,在没有去批次前,不同样本的免疫细胞会比较好的聚在一起,上皮类的则会异质性比较大。对于数据整合与否,大家的争议也比较大,整合往往会不管批次差异还是生物学差异都会一视同仁的去除,还是可以看前面说的曾老板写的那两个非常棒的帖子中的探讨:
我这里走一下常规的harmony,对不同的样本进行整合:
# 运行harmony
sce.all.filt <- RunHarmony(sce.all.filt, "orig.ident")
names(sce.all.filt@reductions)
head(sce.all.filt@meta.data)
# UMAP & TSNE 降维
sce.all.filt <- RunUMAP(sce.all.filt, dims = 1:20, reduction = "harmony")
sce.all.filt <- RunTSNE(sce.all.filt, dims = 1:20, reduction = "harmony")
p <- DimPlot(sce.all.filt, reduction = "umap",label=T,group.by = "orig.ident")
p
ggsave(filename='2-harmony/umap-by-orig.ident-after-harmony.png',plot = p, width = 6,height = 5.5)
harmony 整合后:
运行一下不同resulotions下的分群,后面就可以比较方便的来回调整res以便进行细胞注释:
# 聚类
# 设置不同的分辨率,观察分群效果(选择哪一个?)
sce.all.filt <- FindNeighbors(sce.all.filt, reduction = "harmony", dims = 1:20)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
# graph.name = "CCA_snn",
# sce.all.filt <- FindClusters(sce.all.filt, resolution = res, algorithm = 1)
sce.all.filt <- FindClusters(sce.all.filt, resolution = res)
}
colnames(sce.all.filt@meta.data)
apply(sce.all.filt@meta.data[,grep("RNA_snn",colnames(sce.all.filt@meta.data))],2,table)
# save
p_tree <- clustree(sce.all.filt@meta.data, prefix = "RNA_snn_res.")
ggsave(plot=p_tree, filename="2-harmony/Tree_diff_resolution.pdf", width = 7, height = 10)
p <- DimPlot(sce.all.filt, reduction = "umap", group.by = "RNA_snn_res.0.3",label = T) +
ggtitle("louvain_0.3")
ggsave(plot=p, filename="2-harmony/Dimplot_resolution_0.3.pdf",width = 6, height = 6)
p <- DimPlot(sce.all.filt, reduction = "umap", group.by = "RNA_snn_res.0.1",label = T) +
ggtitle("louvain_0.1")
ggsave(plot=p, filename="2-harmony/Dimplot_resolution_0.1.pdf",width = 6, height = 6)
table(sce.all.filt@active.ident)
# saveRDS(sce.all.filt, file = "2-harmony/sce.all_int.rds")
library(qs)
qsave(sce.all.filt, file="2-harmony/sce.all_int.qs")
(未完待续~)