单细胞测序—标准流程代码(2) — 标记基因与细胞注释

原创

sheldor没耳朵

发布于 2024-08-22 15:08:41

9530

文章被收录于专栏：单细胞测序单细胞测序

单细胞测序—标准流程代码(2) — 标记基因与细胞注释

书接上回，已经做好数据质控、过滤、去批次、降维聚类分群后，接下来就是进行细胞注释方面的工作

step4: 看标记基因库

# 原则上分辨率是需要自己肉眼判断，取决于个人经验
# 为了省力，我们直接看  0.1和0.8即可

table(Idents(sce.all.int))
   0    1    2    3    4    5 
5241 4859 1365 1261 1062  138 
table(sce.all.int$seurat_clusters)
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
2460 2200 1998 1878 1061  936  690  635  598  511  428  241  139   92   59 
table(sce.all.int$RNA_snn_res.0.1) 
   0    1    2    3    4    5 
5241 4859 1365 1261 1062  138 
table(sce.all.int$RNA_snn_res.0.8)  
getwd()

#选择分辨率为0.1的分群进行标记
dir.create('check-by-0.1')
setwd('check-by-0.1')
sel.clust = "RNA_snn_res.0.1"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident) 
source('../scRNA_scripts/check-all-markers.R')
setwd('../') 
getwd()

#选择分辨率为0.8的分群进行标记
dir.create('check-by-0.8')
setwd('check-by-0.8')
sel.clust = "RNA_snn_res.0.8"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident) 
source('../scRNA_scripts/check-all-markers.R')
setwd('../') 
getwd()

last_markers_to_check

4.1 check-all-markers.R

# 代码需要保证统一
# 只能是 sce.all
gastric_cancer_markers = c('PTPRC', 
                   'MUC2' , 'ITLN1',
                   'FABP1' , 'APOA1',
                   'CEACAM5' , 'CEACAM6',
                   'EPCAM', 'KRT18', 'MUC1',
                   'MUC6' , 'TFF2',
                   'PGA4' , 'PGA3',
                   'MUC5AC' , 'TFF1','CHGA' , 'CHGB') 
Myo=c("Krt17", "Krt14", "Krt5", "Acta2", "Myl9", "Mylk", "Myh11")
Lum=c("Krt19", "Krt18", "Krt8")
Hs=c("Prlr", "Cited1", "Pgr", "Prom1", "Esr1")  
AV=c("Mfge8", "Trf", "Csn3", "Wfdc18", "Elf5", "Ltf")
Lp=c("Kit", "Aldh1a3", "Cd14")
Fib=c("Col1a1", "Col1a2", "Col3a1", "Fn1")
GSE150580_breast_cancer_markers_list =list(
  Myo=Myo,
  Lum=Lum,
  Hs=Hs, 
  AV=AV,
  Lp=Lp,  
  Fib=Fib 
  
) 

# macrophages (Adgre1, Cd14, and Fcgr3),
# cDCs (Xcr1, Flt3, and Ccr7),
# pDCs (Siglech, Clec10a, and Clec12a), 
# monocytes (Ly6c2 and Spn), 
# neutrophils (Csf3r, S100a8, and Cxcl3),
macrophages=c('Adgre1', 'Cd14',  'Fcgr3')
cDCs=c('Xcr1', 'Flt3',  'Ccr7')
pDCs=c('Siglech', 'Clec10a',  'Clec12a')  
monocytes=c('Ly6c2' , 'Spn')
neutrophils=c('Csf3r', 'S100a8',  'Cxcl3') 
SCP1661_meyloids_markers_list =list(
  macrophages=macrophages,
  cDCs=cDCs,
  pDCs=pDCs, 
  monocytes=monocytes,
  neutrophils=neutrophils  
) 
 
lung_epi_markers =  c('TPPP3',"SPRR3","GDPD3","SPRR1A","SPRR2A","RARRES2","TMPRSS11E",
                    "ASCL3","CFTR","FOXI2","1SG20","FOXI1",
                    "SAA4","SAA2","EFHC1","CCDC153","CCDC113","SAA1","CDC20B","FOXJ1",
                    "MYCL","FOXN4","CCNO",
                    "PIGR","BP1","MUC5A","VMO1","SCGB3A1","CYP2A13","CYP2B6","SCGB1A1",
                    "BCAM","KRT15","KRT5","TP63")

myeloids_markers_list1 =list(
  CM=c("TTN","MYH7","MYH6","TNNT2") ,
  EC=c("VWF", "IFI27", "PECAM1","MGP"),
  FB=c("DCN", "C7" ,"LUM","FBLN1","COL1A2"),
  MP=c("CD163", "CCL4", "CXCL8","PTPRC"),
  SMC=c("ACTA2", "CALD1", "MYH11"),
  Tc=c("CD3D","CD3E"),
  DC1 = c( 'Clec9a', 'Xcr1',   'Wdfy4'), 
  DC2 = c('Itgax', 'Sirpa',   'Cd209a'), 
  mregDCs= c('Ccr7', 'Cd80', 'Cd200',   'Cd247') ,
  hypoxia=c('Hif1a', 'Slc2a1', 'Vegfa', 'Hmox1', 
            'Bnip3', 'Nos2', 'Mmp2', 'Sod3', 
            'Cited2', 'Ldha'),
  peric=c("ABCC9","PDGFRB","RGS5")
)

myeloids_markers_list2 = list(pDC = c("CLEC4C","IRF7","TCF4","GZMB"),
                      cDC1 = c("XCR1","CLNK","CLEC9A"),
                      cDC2 = c("FCER1A","HLA-DPB1","HLA-DQB1","CD1E","CD1C","CLEC10A","HLA-DQA2"),
                      DC3 = c("CCL19","LAMP3","IDO1","IDO2","LAD1","FSCN1","CCR7","LY75","CCL22","CD40","BIRC3","NFKB2"),
                      Macrophages = c("APOC1","HLA-DRB5","C1QA","C1QB"),
                      RTMs = c("THBS1"),#Resident tissue macrophages
                      Lam = c("APOE"),#Lipid associated macrophages
                      Monocytes = c("LYZ","HLA-DRB1","TIMP1","S100A11","CXCL8","IL1B","PTGS2","S100A9","S100A8","MMP19"),
                      Mono_C = c('CD14'),#Mono_CD14
                      Mono_F = c('FCGR3A'),#Mono_FCGR3A
                      Mast = c('TPSAB1' , 'TPSB2'))
 

Tcells_markers = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CCR7', 'SELL' , 'TCF7','CXCR6' , 'ITGA1',
                   'FOXP3', 'IL2RA',  'CTLA4','GZMB', 'GZMK','CCL5',
                   'IFNG', 'CCL4', 'CCL3' ,
                   'PRF1' , 'NKG7') 
###CD4T
CD4_markers_list =list(
  Tc=c("CD3D","CD3E"),
  CD4=c("CD4" ),
  Treg=c("TNFRSF4","BATF","TNFRSF18","FOXP3","IL2RA","IKZF2"),
  naive=c("CCR7","SELL","CD5"),
  Tfh=c("CXCR5","BCL6","ICA1","TOX","TOX2","IL6ST"),#滤泡辅助性T细胞
  ILC=c("TNFRSF25","KRT81","LST1","AREG","LTB","CD69")
) 

###CD8T
CD8_markers_list1 =list(
  CD8=c("CD8A","CD8B"),
  TN_TCM=c("CCR7","SELL","TCF7","LEF1"),
  TEM=c("GZMK"  ),
  TEFF=c("TBX21","FCGR3A","FGFBP2"),
  TRM=c("XCL1","XCL2","ITGAE","CD69"),
  IEL_T = c("TMIGD2"),
  yT1c=c("GNLY","PTGDS","GZMB","TRDC"),
  yT2c=c("TMN1","HMGB2","TYMS"),
  MAIT_T = c("SLC4A10")
) 
CD8_markers_list2 =list(
  CD8T=c("CD8A","CD8B"),
  MAIT=c("ZBTB16","NCR3","RORA"),
  ExhaustedCD8T=c("LAG3","TIGIT","PDCD1","HAVCR2","CTLA4"),
  EffMemoryCD8=c("EOMES","ITM2C"),
  Resting_NK=c("XCL1","XCL2","KLRC1"),
  Cytotoxic_NK=c("CX3CR1","FGFBP2","FCGR3A","KLRD1"),
  Pre_exhausted=c("IFNG","PRF1","GNLY","GZMA","NKG7","GZMK")
)
 
cd4_and_cd8T_markers_list  =list( 
  naive=c("CCR7","SELL","TCF7","IL7R","CD27","CD28","LEF1","S1PR1"),
  CD8Trm=c("XCL1","XCL2","MYADM"),
  NKTc=c("GNLY","GZMA"), 
  Tfh=c("CXCR5","BCL6","ICA1","TOX","TOX2","IL6ST"),
  th17=c("IL17A","KLRB1","CCL20","ANKRD28","IL23R","RORC","FURIN","CCR6","CAPG","IL22"),
  CD8Tem=c("CXCR4","GZMH","CD44","GZMK"),
  Treg=c("FOXP3","IL2RA","TNFRSF18","IKZF2"),
  naive=c("CCR7","SELL","TCF7","IL7R","CD27","CD28"),
  CD8Trm=c("XCL1","XCL2","MYADM"), 
  MAIT=c("KLRB1","ZBTB16","NCR3","RORA"),
  yT1c=c("GNLY","PTGDS","GZMB","TRDC"),
  yT2c=c("TMN1","HMGB2","TYMS"),
  yt=c("TRGV9","TRDV2")
) 


# CD20 (MS4A1)表达于除plasma B 之外的所有B，很关键的区分naive 和plasma的marker
# SDC1 = CD138 plasma B （接受抗原，可表达抗体） 
Bcels_markers_list = list(
  All = c('MS4A1','SDC1','CD27','CD38','CD19', 'CD79A'),
  GC_B = c('IL4R','TCL1A','LRMP','SUGCT'),
  IGA_plasm_B= c ( 'IGHA1'), 
  IGG_plasm_B= c ( 'IGHG1')
)  

Hepatic_stellate_markers_list =list(
  qHSC=c("Lrat","Ecm1","Angptl6","Vipr1" ),
  S1=c("Ccl2" ,"Cxcl10" ,"Cxcl1" ,"Ccl7" ),
  S2=c("Acta2" ,"Tpm1" ,"Vim" ,"Tagln","Tnc","Tpm2"),
  S3=c("Col1a1","Col1a2","Col3a1" ,"Lox","Lum" )
)
  
# arteries (HEY1, IGFBP3), capillaries (CD36, CA4), veins (ACKR1) and
# lymphatic ECs (LECs; CCL21, PROX1). 
stromal_markers = c('TEK',"PTPRC","EPCAM","PDPN",
                   "PECAM1",'PDGFRB',"PLVAP",'PROX1','ACKR1','CA4','HEY1',
                   'CSPG4','GJB2', 'RGS5','ITGA7',
                   'ACTA2','RBP1','CD36', 
                   'ADGRE5','COL11A1','FGF7', 'MME') 

last_markers = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A',
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## human  fibo 
                 'GJB2', 'RGS5',
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                 "PLVAP",'PROX1','ACKR1','CA4','HEY1',
                   'EPCAM' , 'KRT19','KRT7', # epi 
                   'FYXD2', 'TM4SF4', 'ANXA4',# cholangiocytes
                   'APOC3', 'FABP1',  'APOA1',  # hepatocytes
                   'Serpina1c',
                   'PROM1', 'ALDH1A1' )

gastric_cancer_markers 
lung_epi_markers
Tcells_markers
stromal_markers 
last_markers 

GSE150580_breast_cancer_markers_list 
SCP1661_meyloids_markers_list 
myeloids_markers_list1 
myeloids_markers_list2 
CD4_markers_list 
CD8_markers_list1 
CD8_markers_list2 
cd4_and_cd8T_markers_list   
Bcels_markers_list 
Hepatic_stellate_markers_list 


markers = c('gastric_cancer_markers','lung_epi_markers',
            'Tcells_markers',
            'stromal_markers', 
            'last_markers' )
markers_list <- c(
  'GSE150580_breast_cancer_markers_list' ,
  'SCP1661_meyloids_markers_list' ,
  'myeloids_markers_list1' ,
  'myeloids_markers_list2' ,
  'CD4_markers_list' ,
  'CD8_markers_list1' ,
  'CD8_markers_list2' ,
  'cd4_and_cd8T_markers_list'   ,
  'Bcels_markers_list' ,
  'Hepatic_stellate_markers_list' 
)

p_umap=DimPlot(sce.all.int, reduction = "umap",raster = F,
               label = T,repel = T) 
p_umap 

if(sp=='human'){
   lapply(markers, function(x){
     #x=markers[1]
     genes_to_check=str_to_upper(get(x)) 
     DotPlot(sce.all.int , features = genes_to_check )  + 
       coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     h=length( genes_to_check )/6+3;h
     ggsave(paste('check_for_',x,'.pdf'),height = h)
   })
  lapply(markers_list, function(x){
    # x=markers_list[1]
    genes_to_check = lapply(get(x), str_to_upper)
    dup=names(table(unlist(genes_to_check)))[table(unlist(genes_to_check))>1]
    genes_to_check = lapply(genes_to_check, function(x) x[!x %in% dup])
  
    DotPlot(sce.all.int , features = genes_to_check )  + 
     # coord_flip() + 
      theme(axis.text.x=element_text(angle=45,hjust = 1))
    
    w=length( unique(unlist(genes_to_check)) )/5+6;w
    ggsave(paste('check_for_',x,'.pdf'),width  = w)
  })
  
  last_markers_to_check <<- str_to_upper(last_markers ) 

 }else if(sp=='mouse'){
   lapply(markers, function(x){
     #x=markers[1]
     genes_to_check=str_to_title(get(x)) 
     DotPlot(sce.all.int , features = genes_to_check )  + 
       coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     h=length( genes_to_check )/6+3;h
     ggsave(paste('check_for_',x,'.pdf'),height = h)
   })
   lapply(markers_list, function(x){
     # x=markers_list[1]
     genes_to_check = lapply(get(x), str_to_title)
     dup=names(table(unlist(genes_to_check)))[table(unlist(genes_to_check))>1]
     genes_to_check = lapply(genes_to_check, function(x) x[!x %in% dup])
     
     DotPlot(sce.all.int , features = genes_to_check )  + 
       # coord_flip() + 
       theme(axis.text.x=element_text(angle=45,hjust = 1))
     
     w=length( unique(unlist(genes_to_check)) )/5+6;w
     ggsave(paste('check_for_',x,'.pdf'),width  = w)
   })
   
   last_markers_to_check <<- str_to_title(last_markers ) 
}else {
  print('we only accept human or mouse')
} 

p_all_markers = DotPlot(sce.all.int , features = last_markers_to_check )  + 
  coord_flip() + 
  theme(axis.text.x=element_text(angle=45,hjust = 1)) 
p_all_markers+p_umap
h=length( last_markers_to_check )/6+3;h
w=length( unique( Idents(sce.all.int)) )/5+10;w
ggsave(paste('last_markers_and_umap.pdf'),width  = w,height = h)

pro = 'qc-'
if("percent_mito" %in% colnames(sce.all.int@meta.data ) ){

  #可视化细胞的上述比例情况
  feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
  
  feats <- c("nFeature_RNA", "nCount_RNA")
  p1=VlnPlot(sce.all.int , features = feats, pt.size = 0, ncol = 2) + 
    NoLegend()
  w=length(unique(sce.all.int$orig.ident))/3+5;w
  ggsave(filename=paste0(pro,"Vlnplot1.pdf"),plot=p1,width = w,height = 5)
  
  feats <- c("percent_mito", "percent_ribo", "percent_hb")
  p2=VlnPlot(sce.all.int,  features = feats, pt.size = 0, ncol = 3, same.y.lims=T) + 
    scale_y_continuous(breaks=seq(0, 100, 5)) +
    NoLegend()
  w=length(unique(sce.all.int$orig.ident))/2+5;w
  ggsave(filename=paste0(pro,"Vlnplot2.pdf"),plot=p2,width = w,height = 5)
  
}
p3=FeatureScatter(sce.all.int , "nCount_RNA", "nFeature_RNA", 
                  pt.size = 0.5)
ggsave(filename=paste0(pro,"Scatterplot.pdf"),plot=p3)


if(T){
  #  remotes::install_github('genecell/COSGR')
  #  genexcell <- Seurat::GetAssayData(object = object[[assay]],slot = slot)
  marker_cosg <- cosg(
    sce.all.int,
    groups='all',
    assay='RNA',
    slot='data',
    mu=1,
    n_genes_user=100)
  
  save(marker_cosg,file = paste0(pro,'_marker_cosg.Rdata'))
  head(marker_cosg)
  
  ## Top10 genes
  library(dplyr)  
  top_10 <- unique(as.character(apply(marker_cosg$names,2,head,10)))
  # width <-0.006*dim(sce.all.int)[2];width
  # height <- 0.25*length(top_10)+4.5;height
  
  width <- 15+0.5*length(unique(Idents(sce.all.int)));width
  height <- 8+0.1*length(top_10);height
  
  sce.Scale <- ScaleData(sce.all.int ,features =  top_10  )  
  
  DoHeatmap(  sce.Scale , top_10 , 
              size=3)
  
  ggsave(filename=paste0(pro,'DoHeatmap_check_top10_markers_by_clusters.pdf') ,
         # limitsize = FALSE,
         units = "cm",width=width,height=height)
  width <- 8+0.6*length(unique(Idents(sce.all.int)));width
  height <- 8+0.2*length(top_10);height
  DotPlot(sce.all.int, features = top_10 ,
          assay='RNA'  )  + coord_flip() +FontSize(y.text = 4)
  ggsave(paste0(pro,'DotPlot_check_top10_markers_by_clusters.pdf'),
         units = "cm",width=width,height=height)
  
  
  ## Top3 genes
  top_3 <- unique(as.character(apply(marker_cosg$names,2,head,3)))
  
  width <- 15+0.2*length(unique(Idents(sce.all.int)));width
  height <- 8+0.1*length(top_3);height
  
  sce.Scale <- ScaleData(sce.all.int ,features =  top_3  )  
  
  DoHeatmap(  sce.Scale , top_3 , 
              size=3)
  ggsave(filename=paste0(pro,'DoHeatmap_check_top3_markers_by_clusters.pdf') ,
         units = "cm",width=width,height=height)
  
  width <- 8+0.2*length(unique(Idents(sce.all.int)));width
  height <- 8+0.1*length(top_3);height
  DotPlot(sce.all.int, features = top_3 ,
          assay='RNA'  )  + coord_flip()
  ggsave(paste0(pro,'DotPlot_check_top3_markers_by_clusters.pdf'),width=width,height=height)
  
}

4.2 标记基因总结

gastric_cancer_markers（胃癌标记基因）：

这个列表包含了与胃癌相关的标记基因，用于识别或研究胃癌细胞。

lung_epi_markers（肺上皮细胞标记基因）：

这个列表包含了与肺上皮细胞相关的标记基因，可能用于研究肺组织或肺癌中的上皮细胞。

Tcells_markers（T细胞标记基因）：

这个列表包含了与T细胞相关的标记基因，T细胞是免疫系统中的一种关键细胞类型，参与适应性免疫反应。

stromal_markers（基质细胞标记基因）：

这个列表包含了与基质细胞相关的标记基因，基质细胞构成组织中的支持性框架，参与组织结构和信号传导。

last_markers（最后标记基因）：

这个名称看起来比较通用，可能是指某个研究中最后一组基因标记。

GSE150580_breast_cancer_markers_list（乳腺癌标记基因列表）：

如前所述，这个列表包含了与乳腺癌相关的多种细胞类型的标记基因，用于分析乳腺癌中的不同细胞。

SCP1661_meyloids_markers_list（髓系细胞标记基因列表）：

这个列表包含了与髓系细胞相关的标记基因，用于研究髓系细胞的功能和特性。

myeloids_markers_list1 和 myeloids_markers_list2（髓系细胞标记基因列表1和2）：

这两个列表可能包含了不同髓系细胞亚群的标记基因，分别用于研究这些亚群在特定条件或研究中的表现。

CD4_markers_list（CD4+ T细胞标记基因列表）：

这个列表包含了与CD4+ T细胞相关的标记基因，这些细胞在免疫应答中起重要作用。

CD8_markers_list1 和 CD8_markers_list2（CD8+ T细胞标记基因列表1和2）：

这两个列表包含了与CD8+ T细胞相关的标记基因，可能代表不同亚群的CD8+ T细胞，研究这些细胞在免疫反应中的特性。

cd4_and_cd8T_markers_list（CD4+ 和 CD8+ T细胞标记基因列表）：

这个列表可能包含了同时与CD4+ T细胞和CD8+ T细胞相关的基因，用于比较或综合分析这两种细胞类型。

Bcels_markers_list（B细胞标记基因列表）：

这个列表包含了与B细胞相关的标记基因，B细胞是免疫系统中产生抗体的细胞。

Hepatic_stellate_markers_list（肝星状细胞标记基因列表）：

这个列表包含了与肝星状细胞相关的标记基因，这些细胞在肝脏纤维化和其他肝脏病理过程中起重要作用。

这些基因标记列表可以在各种生物医学研究中帮助识别和分析不同的细胞类型及其在疾病中的作用

4.3 代码解释

p_umap=DimPlot(sce.all.int, reduction = "umap",raster = F,
               label = T,repel = T)

reduction = "umap": 指定使用UMAP进行降维。

raster = F: 禁用栅格化，使用矢量图形。

label = T: 在图中标记每个细胞群的标签。

repel = T: 避免标签重叠。

if(sp=='human'){
    lapply(markers, function(x){
      genes_to_check=str_to_upper(get(x)) 
      DotPlot(sce.all.int , features = genes_to_check )  + 
        coord_flip() + 
        theme(axis.text.x=element_text(angle=45,hjust = 1))
      
      h=length( genes_to_check )/6+3;h
      ggsave(paste('check_for_',x,'.pdf'),height = h)
    })
    
 ....

lapply 函数对 markers 列表中的每个元素执行指定操作。
genes_to_check=str_to_upper(get(x))

将当前标记列表 x 中的所有基因名称转换为大写，以便与人类基因名称匹配。

分别对markers、markers_list列表中的基因进行可视化的操作。

last_markers_to_check <<- str_to_upper(last_markers )

将变量 last_markers 中的所有基因名称转换为大写，并将结果存储在一个全局变量 last_markers_to_check中。

注：

<<- 是 R 语言中的一种赋值操作符，用于在函数内部将值赋给全局变量或父级作用域中的变量。与常规的 <- 不同，<<- 可以改变当前函数环境之外的变量。

else if(sp=='mouse'){
   lapply(markers, function(x){
     genes_to_check=str_to_title(get(x)
     ...
   }
   lapply(markers_list, function(x){
     genes_to_check = lapply(get(x), str_to_title)
     ...
   }
   
   last_markers_to_check <<- str_to_title(last_markers )

与human不同，小鼠的基因名需要首字母大写

基因名称转换为首字母大写（str_to_title），因为小鼠基因的标准命名方式是首字母大写。

  marker_cosg <- cosg(
    sce.all.int,
    groups='all',
    assay='RNA',
    slot='data',
    mu=1,
    n_genes_user=100)
  
  save(marker_cosg,file = paste0(pro,'_marker_cosg.Rdata'))
  head(marker_cosg)

cosg函数用于识别细胞群体的特征基因（marker基因）。
marker_cosg 是一个保存结果的变量，包含识别到的marker基因。

sce.all.int: 输入的单细胞对象（SingleCellExperiment 或 Seurat 对象），包含了所有细胞的数据。

groups = all: 指定要对所有群体进行分析，即识别所有群体的marker基因。

assay = 'RNA': 指定使用RNA测序数据进行分析。

slot = 'data': 指定从数据的哪个部分提取表达值，这里选择 data插槽，通常包含标准化后的表达数据。

mu = 1: 参数 mu可能控制分析过程中的一个参数，具体含义取决于 cosg 函数的实现。通常可能与某种筛选标准或过滤条件有关。

n_genes_user = 100: 指定要识别的每个群体的marker基因数量，这里设置为100个。

## Top10 genes
  library(dplyr)  
  top_10 <- unique(as.character(apply(marker_cosg$names,2,head,10))) 
  width <- 15+0.5*length(unique(Idents(sce.all.int)));width
  height <- 8+0.1*length(top_10);height
  
  sce.Scale <- ScaleData(sce.all.int ,features =  top_10  )  
  DoHeatmap(  sce.Scale , top_10 , 
              size=3)
  
  ggsave(filename=paste0(pro,'DoHeatmap_check_top10_markers_by_clusters.pdf') ,
         units = "cm",width=width,height=height)
  width <- 8+0.6*length(unique(Idents(sce.all.int)));width
  height <- 8+0.2*length(top_10);height
  DotPlot(sce.all.int, features = top_10 ,
          assay='RNA'  )  + coord_flip() +FontSize(y.text = 4)
  ggsave(paste0(pro,'DotPlot_check_top10_markers_by_clusters.pdf'),
         units = "cm",width=width,height=height)

## Top3 genes

...

使用 DoHeatmap 函数绘制热图，显示 top_10 中基因在不同细胞群体中的表达情况。

使用 DotPlot 函数绘制点图，显示 top_10 基因在不同细胞群体中的表达情况。

4.4 总结

以下代码均是分辨率为0.1的运行结果，其他分辨率类似

dir.create('check-by-0.1')
setwd('check-by-0.1')
sel.clust = "RNA_snn_res.0.1"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident) 
source('../scRNA_scripts/check-all-markers.R')

即这一步会生成21张图

前8张为各个基因list中，在各个细胞分群中的表达情况：

如

checkfor Tcells_markers .pdf

checkfor myeloids_markers_list2 .pdf

后面的last_markers_and_umap.pdf同时可视化了umap降维图，和last_markers基因列表（包含多数细胞类型的特征基因）中在各个分群中的表达情况。

last_markers_and_umap.pdf

后面的四张图是可视化了当前分辨率中各个分群中的top10基因的热图和点图。

qc-DoHeatmap_check_top10_markers_by_clusters.pdf

qc-DotPlot_check_top10_markers_by_clusters.pdf

最后3张是可视化了当前分群中的nFeature、nCount、线粒体核糖体血红细胞基因的分布、以及相关性图。

qc-Vlnplot1.pdf

qc-Vlnplot2.pdf

qc-Scatterplot.pdf

step5: 确定单细胞亚群生物学名字

一般来说，为了节省工作量，我们选择0.1的分辨率进行命名

因为命名这个步骤是纯人工操作

除非0.1确实分群太粗狂了，我们就选择0.8

根据上述图片手动给细胞分群命名

这里因为用的是测试数据集，其自带细胞注释。因此用气泡图比较了本身和手动注释的区别。

5.1 比较注释间差别（可选）

source('scRNA_scripts/lib.R')
sce.all.int = readRDS('2-harmony/sce.all_int.rds')
sp='human'
colnames(sce.all.int@meta.data) 
table(sce.all.int$RNA_snn_res.0.1)
# pbmc_small <- BuildClusterTree(object = sce.all.int)
# plot(Tool(object = pbmc_small, slot = 'BuildClusterTree'))
# plot(pbmc_small@tools$BuildClusterTree)

# 如果是手动给各个单细胞亚群命名
if(F){
  sce.all.int
  celltype=data.frame(ClusterID=0:5 ,
                      celltype= 0:5) 
  #定义细胞亚群        
  celltype[celltype$ClusterID %in% c( 1 ,4 ),2]='mono'  
  celltype[celltype$ClusterID %in% c( 0,3 ),2]='Tcells' 
  celltype[celltype$ClusterID %in% c( 2 ),2]='Bcells'
  celltype[celltype$ClusterID %in% c( 5 ),2]='pDC'
  
  head(celltype)
  celltype
  table(celltype$celltype)
  sce.all.int@meta.data$celltype = "NA"
  
  for(i in 1:nrow(celltype)){
    sce.all.int@meta.data[which(sce.all.int@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  Idents(sce.all.int)=sce.all.int$celltype
  
  
}

table(sce.all.int$seurat_annotations)
#比较和数据集自带的细胞分群的差别
gplots::balloonplot(
  table(sce.all.int$RNA_snn_res.0.8,
        sce.all.int$seurat_annotations)
)

gplots::balloonplot(
  table(sce.all.int$celltype,
        sce.all.int$seurat_annotations)
)

注：

sce.all.int = readRDS('2-harmony/sce.all_int.rds')

从文件 sce.all_int.rds 中读取 SingleCellExperiment 对象并存储在变量 sce.all.int中。该对象包含了所有细胞的表达数据以及相关的元数据。

5.2手动注释部分

celltype=data.frame(ClusterID=0:5 ,
                      celltype= 0:5) 
      
celltype[celltype$ClusterID %in% c( 1 ,4 ),2]='mono'  
celltype[celltype$ClusterID %in% c( 0,3 ),2]='Tcells' 
celltype[celltype$ClusterID %in% c( 2 ),2]='Bcells'
celltype[celltype$ClusterID %in% c( 5 ),2]='pDC'

这个步骤创建了一个名为 celltype 的数据框，包含两列：

ClusterID: 表示聚类的ID，这里是0到5的整数。
celltype: 初始化时与 ClusterID 相同，也填充为0到5的整数

这里的0～5需要根据当前分辨率的细胞分群修改对应的数

#初始化celltype列
sce.all.int@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
    sce.all.int@meta.data[which(sce.all.int@meta.data$RNA_snn_res.0.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  Idents(sce.all.int)=sce.all.int$celltype

为 sce.all.int 对象的元数据（meta.data）添加一个新的 celltype 列，并将其初始值设置为 "NA"。
使用一个 for循环遍历 celltype数据框的每一行：

which()函数返回满足条件的索引位置，即返回 TRUE 的位置。这个表达式会找到 sce.all.int对象中 RNA_snn_res.0.1 列中等于当前 ClusterID 的所有行索引。

sce.all.int@meta.datawhich(...),'celltype'

对选择出的那些行中的 celltype列进行操作。

sce.all.int@meta.datawhich(...),'celltype' <- celltype$celltypei

将特定聚类 ID 对应的细胞的 celltype 列更新为相应的细胞类型。

Idents(sce.all.int)是Seurat包中用于标识细胞身份的函数。将细胞的身份标识符更新为刚刚分配的 celltype，从而可以在后续分析和绘图中使用这些细胞类型标签。

这里手动注释还是比较粗糙的，为了方便显示，还是使用数据集本身自带的细胞注释进行后续的分析

#DimPlot(sce.all.int, reduction = "umap",raster = F,group.by = 'RNA_snn_res.0.1',
#       label = T,repel = T) +
# DimPlot(sce.all.int, reduction = "umap",raster = F,group.by = 'seurat_annotations',
#          label = T,repel = T) 

# 如果前面成功的给各个细胞亚群命名了
# 就可以运行下面的代码
sce.all.int$celltype = as.character(sce.all.int$seurat_annotations)
if("celltype" %in% colnames(sce.all.int@meta.data ) ){
  
  sel.clust = "celltype"
  sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
  table(sce.all.int@active.ident) 
  
  dir.create('check-by-celltype')
  setwd('check-by-celltype')
  source('../scRNA_scripts/check-all-markers.R')
  setwd('../') 
  getwd()
  phe=sce.all.int@meta.data
  save(phe,file = 'phe.Rdata')
  pdf('celltype-vs-orig.ident.pdf',width = 10)
  gplots::balloonplot(table(sce.all.int$orig.ident,
                            sce.all.int$celltype))
  dev.off()
}

同样再跑下check-all-markers.R这个脚本，会出21张图。如

qc-Vlnplot1.pdf

qc-DoHeatmap_check_top3_markers_by_clusters.pdf

qc-DotPlot_check_top3_markers_by_clusters.pdf

last_markers_and_umap.pdf

最后

 pdf('celltype-vs-orig.ident.pdf',width = 10)
  gplots::balloonplot(table(sce.all.int$orig.ident,
                            sce.all.int$celltype))

展示了每个细胞群中在多分组中的分布情况

celltype-vs-orig.ident.pdf

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

数据挖掘

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

数据挖掘

#单细胞测序

#标准流程

登录后参与评论

0 条评论

热度

单细胞测序—标准流程代码(2) — 标记基因与细胞注释

单细胞测序—标准流程代码(2) — 标记基因与细胞注释

单细胞测序—标准流程代码(2) — 标记基因与细胞注释

step4: 看标记基因库

4.1 check-all-markers.R

4.2 标记基因总结

4.3 代码解释

4.4 总结

step5: 确定单细胞亚群生物学名字

5.1 比较注释间差别（可选）

5.2手动注释部分

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐