肝癌患者 snRNA-seq 和 scRNA-seq 测序数据是否有区别？

生信菜鸟团

发布于 2022-10-31 17:43:03

6170

发布于 2022-10-31 17:43:03

文章被收录于专栏：生信菜鸟团

这周的推文是对GSE210679数据集进行复现，发现这个数据是由一个snRNA-seq和一个scRNA-seq测序数据组成。

文章：Comparison of single‑nucleus and single‑cell transcriptomes in hepatocellular carcinoma tissue.

文章摘要：单核 RNA 测序 (snRNA ‑seq) 是一种用于分析细胞中基因表达的方法分离是复杂的，例如在肝细胞癌中 (HCC) 组织。它构成了单细胞 RNA 的替代品测序（scRNA-seq）通过分析细胞核而不是整个细胞；但是，是否可以完全替代 HCC 中的 scRNA-seq 仍有待阐明。

在目前的研究中，在肿瘤组织中将 scRNA ‑seq 与 snRNA ‑seq 进行比较使用 10X Genomics 从 HCC 患者中获得。Seurat 也被用来处理数据并比较两个测序之间的差异识别不同细胞类型的方法。在目前的研究中， 14,349 个单核和 9,504 个单细胞的转录组从上述HCC组织中获得。一共 21 个离散细胞簇，包括肝细胞、内皮细胞细胞、成纤维细胞、B细胞、T细胞、自然杀伤细胞和巨噬细胞被识别。值得注意的是，使用 snRNA ‑seq 检测到大量的肝细胞，在scRNA‑seq 检测到增加的免疫细胞。

本研究的结果提供了一个以单细胞分辨率显示人类 HCC 的综合图像。此外，本研究的结果进一步表明，在某些情况下，snRNA ‑seq 可能足以替代 scRNA ‑seq ，snRNA ‑seq 在肝细胞中的表现水平有所提高测序。结合使用两种测序方法可能有助于细胞间相互作用的研究。

数据集：GSE210679

要复现的图：

step1:导入QC后的数据以及降维分群

由于此次数据涉及到scRNA-seq和snRNA-seq测序数据，需要scRNA-seq进行QC后再与snRNA-seq数据进行整合。之前并没有了解过snRNA-seq测序数据，通过处理这组数据去了解了一下，发现该数据不用过滤线粒体核糖体。

dir.create("2-harmony")
getwd()
setwd("2-harmony")

sce.all=readRDS("../1-QC/sce.all_qc.rds")
sce=sce.all.filt 
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce

sce.all=FindClusters(sce.all, #graph.name = "CCA_snn", 
                       resolution = 0.5, algorithm = 1)
#接下来分析，按照分辨率为0.5进行 
sel.clust = "RNA_snn_res.0.5"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")
setwd('../')

step2: marker gene

library(ggplot2) 
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'MKI67','TOP2A','KLRC1',
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'FGF7','MME', 'ACTA2',
                   'PECAM1', 'VWF',    
                   'KLRB1','NCR1', # NK 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1',
                   'MKI67' ,'TOP2A' )
library(stringr)  
genes_to_check=str_to_title(unique(genes_to_check))
genes_to_check
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA'  )  + coord_flip()

p_all_markers
ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf")

p_all_markers
p_umap=DimPlot(sce.all, reduction = "umap", label = T,label.box = T) 
library(patchwork)
p_all_markers+p_umap
ggsave('markers_umap.pdf',width = 15)

step3:细胞亚群生物学命名

 table(sce.all@meta.data$RNA_snn_res.0.5)
  #细胞分群命名
  celltype=data.frame(ClusterID=0:18,
                      celltype= 0:18) 
  #定义细胞亚群  
  celltype[celltype$ClusterID %in% c(0,6,8),2]='T cells'
  celltype[celltype$ClusterID %in% c(15),2]='CD20+ B cells'
  celltype[celltype$ClusterID %in% c(16),2]='plasma B cells'
  
  celltype[celltype$ClusterID %in% c(7,13),2]='myeloid'
  celltype[celltype$ClusterID %in% c(1,2,3,5,14,18),2]='Hepatocyte'
  
  celltype[celltype$ClusterID %in% c(4,17),2]='Endo'
  celltype[celltype$ClusterID %in% c(12),2]='NK'
  
  celltype[celltype$ClusterID %in% c(9,10),2]='fibo'
  celltype[celltype$ClusterID %in% c(11),2]='cycling'
  
  head(celltype)
  celltype
  table(celltype$celltype)
  sce.all@meta.data$celltype = "NA"
  for(i in 1:nrow(celltype)){
    sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.5 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  table(sce.all@meta.data$celltype)
  saveRDS(sce.all, "sce.all_int_celltype.rds")

#papermarker
paper<-"ALB,SERPINA1,HNF4A,EPCAM,CD3D,CD3E,NKG7,CD68,CD14,CD163,
CD1C,CLEC4C,KIT, IGHG1,JCHAIN,CD79A,VWF,PECAM1,
FCGR2B,ACTA2,COL1A1,COL1A2"
papermarker<-str_to_upper(trimws(strsplit(paper,',')[[1]]))
p <- DotPlot(sce.all, features = unique(papermarker),
             assay='RNA'  )  + coord_flip()

p
ggsave(plot=p, filename="check_paper_marker_by_seurat_cluster0.5.pdf")

  DimPlot(sce.all, reduction = "umap",split.by = 'group',
          group.by = "celltype",label = T#,label.box = T
          ) 
  ggsave('group_umap_celltype_group.pdf',width = 14,height = 7)

比较符合总结的scRNA_seq和snRNA_seq的知识点。参考：单细胞核转录组测序 - 简书 (jianshu.com) 虽然snRNA-seq能够获得更加全面完整的细胞类型，但是对于某些细胞类型的获得比例不如scRNA-seq，主要表现为免疫细胞。从两分组细胞分布对比来看snRNA-seq测试数据中 T，B，NK的细胞数量都减少了；而Fibo, Hepatcyte,Endo细胞数量增多。

step4: 细胞亚群比例

# 1.加载R包 ----
rm(list=ls()) 
library(ggplot2) 
library(cowplot) 
library(paletteer)  
library(gplots)
library(ggpubr)    
library(ggsci) 
library(stringr)
# 2.设置工作路径 ----
#在meta信息中添加分组信息
getwd() 
dir.create("../4_group")
setwd("../4_group/")
# 3.读取数据 ----
load(file = '../3-cell/phe-by-markers.Rdata')
phe[1:4,1:4]
table(phe$orig.ident)
table(phe$group) 
sce=readRDS("../3-cell/sce.all_int_celltype.rds")
table(sce@meta.data$group)
# 4.可视化 ----
## 4.1 每种细胞类型中，分组所占比例 ----
library(tidyr)# 使用的gather & spread
library(reshape2) # 使用的函数 melt & dcast 
library(dplyr)
library(ggplot2)
tb=table(phe$celltype,
         phe$group)
balloonplot(tb)
head(tb)
bar_data <- as.data.frame(tb)
head(bar_data)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
write.csv(bar_per,file = "celltype_by_group_percent.csv")

bar_per$group=bar_per$Var2
#可视化
#celltypebygroup
library(ggalluvial)
ggplot(bar_per,
       aes(x = Var1, y=percent,stratum = group, alluvium = percent,
           fill = group, label = group)) +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  geom_flow(stat = "alluvium", lode.guidance = "frontback",
            color = "darkgray") +
  geom_stratum() +
  theme(legend.position = "bottom") +
  #图例位置
  ggtitle("by celltype")

ggsave("celltypebygroup_percent.pdf")

## 每种降维分群中，各个分组所占比例 ----
bar_data <- as.data.frame(table(phe$celltype,phe$orig.ident))

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per)
write.csv(bar_per,file = "celltype_by_orig.ident_percent.csv")

bar_per$group=bar_per$Var2

#在scales包中找了几个颜色
library("scales")
pal_npg("nrc")(10)
#  [1] "#E64B35FF" "#4DBBD5FF" "#00A087FF" "#3C5488FF" "#F39B7FFF" "#8491B4FF"
#  [7] "#91D1C2FF" "#DC0000FF" "#7E6148FF" "#B09C85FF"
 show_col(pal_npg("nrc")(10))

pal_npg("nrc",alpha = 0.6)(10)
 # [1] "#E64B3599" "#4DBBD599" "#00A08799" "#3C548899" "#F39B7F99" "#8491B499"
 # [7] "#91D1C299" "#DC000099" "#7E614899" "#B09C8599"

bar_data <- as.data.frame(table(phe$seurat_clusters,phe$group))
bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per)

bar_per$group=bar_per$Var2
#换了一个画法
mycolor = c("#E64B3599",
            "#4DBBD5FF")
ggplot(bar_per, aes(x =Var1, y= percent, fill = group,
                    stratum=group, alluvium=group)) + 
  geom_col(width = 0.5, color='black')+
  geom_flow(width=0.5,alpha=0.4, knot.pos=0.5)+
  theme_classic() +
  labs(x='sample',y ='Ratio',title = "by cluster"
  )+
  coord_flip()+
  scale_fill_manual(values = mycolor)#+  
 # stat_compare_means(aes(group=group),
   #                  symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1),
    #                  symbols = c("***", "**", "*", "NS")),label = "p.signif",
     #                label.y = 1.2,size = 5)

ggsave("clusterbygroup_percent.pdf")

#如果对其进行统计检验
+stat_compare_means(aes(group=group),
                     symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                     symbols = c("***", "**", "*", "NS")),label = "p.signif",
                    label.y = 1.2,size = 5)