出发点在于最近的一个小需求。很多情况下用在线数据库的原因是很方便查看一个很小的点,例如生存分析例如共表达例如人群中多少突变比例;虽说这在成型的Figure1代码里也很容易,但是你要用数据去说服别人的时候,还是给别人一个在线数据库更让人信服。
随便一搜,就出来一个小细胞肺癌的芯片数据集,那就来看看这个
哦转换完ID之后发现这个数据集没有我想要的两个基因; 换一个,随便搜了几篇文章里所使用到的数据集: ①GSE6044和GSE40275 ②GSE43346和GSE6044 ③GSE6044和GSE11969 ④GSE40275,GSE1037,GSE44447 综合多个数据集的数据可以看到,DLL3在小细胞肺癌中高表达。
这里只是一个小小的印证猜想的简陋代码
rm(list = ls())
library(GEOquery)
eSet = getGEO("GSE40275", destdir = '.', getGPL = F)
class(eSet)
length(eSet)
eSet = eSet[[1]]
class(eSet)
exp <- exprs(eSet)
dim(exp)
#是否需要log
range(exp)
#exp = log2(exp+1) #需要log才log
#boxplot(exp,las = 2)
pd <- pData(eSet)
p = identical(rownames(pd),colnames(exp));p
if(!p) {
s = intersect(rownames(pd),colnames(exp))
exp = exp[,s]
pd = pd[s,]
}
gpl_number <- eSet@annotation;gpl_number
# 1.Group----
library(stringr)
library(tidyr)
library(tibble)
library(dplyr)
colnames(pd)
k = str_detect(pd$characteristics_ch1.1,"Small|normal");table(k)
pd = pd[k,]
exp = as.data.frame(exp)
exp = exp[,colnames(exp) %in% rownames(pd)]
p = identical(rownames(pd),colnames(exp));p
k = str_detect(pd$source_name_ch1,"normal");table(k)
Group = ifelse(k,"Normal","SCLC")
Group = factor(Group,levels = c("Normal","SCLC"))
Group
#2.探针注释的获取-----------------
library(tinyarray)
find_anno(gpl_number)
#1.加probe_id列,把行名变成一列
get_gpl_txt = function (gpl, destdir = getwd(), download = F)
{
if (!stringr::str_starts(gpl, "GPL|gpl"))
stop("wrong GPL accession")
gpl = stringr::str_to_upper(gpl)
url = paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
gpl, "&targ=self&form=text&view=data")
message(url)
if (download) {
download.file(url, destfile = paste0(gpl, ".txt"))
message("If the download fails, check that your data is microarray data.")
}
}
get_gpl_txt("GPL15974")
a1=read.delim("GPL15974.txt",comment.char = "#",skip = 1,header = F)
colnames(a1) = a1[2,]
b=a1[-c(1,2),]
library(clusterProfiler)
library(org.Hs.eg.db)
s2e = bitr(b$ENTREZ_GENE_ID,
fromType = "ENTREZID",
toType = "SYMBOL",
OrgDb = org.Hs.eg.db)
ids=merge(b,s2e,by.x="ENTREZ_GENE_ID",by.y="ENTREZID")
ids=ids[,c(2,8)]
colnames(ids)=c("probe_id","symbol")
library(dplyr)
exp = as.data.frame(exp)
dat = mutate(exp,probe_id = rownames(exp))
ids = distinct(ids,symbol,.keep_all = T)
dat = inner_join(dat,ids,by="probe_id")
exprSet2 = avereps(dat[,-c(59:60)],
ID = dat$symbol)
exprSet2[1:5,1:5]
exprSet2 = as.data.frame(exprSet2)
c('DLL3','CD276') %in% rownames(exprSet2)
exp1 = exprSet2['DLL3',]
exp2 = exprSet2['CD276',]
# exp1 = t(as.data.frame(exp1)) #后面很多代码还是用基因×样本的形式作为输入,所以在这里转置一下
# rownames(exp1) = 'SLC17A9'
###DLL3在正常组织和SCLC中的表达
dat = t(exp1) %>%
as.data.frame() %>%
rownames_to_column() %>%
mutate(Group)
dim(dat)
pdat = dat%>%
pivot_longer(cols =(!rowname)&(!Group),
names_to = "gene",
values_to = "count")
library(ggplot2)
library(reshape2)
library(ggbeeswarm)
library(beeswarm)
library(ggsignif)
p <- ggplot(pdat, aes(Group, count)) +
geom_beeswarm(cex = 1.0, show.legend = FALSE,aes(color = Group)) + #cex 可以通过点的尺寸定义宽度
scale_color_manual(values = c('blue', 'red')) + #颜色自定义赋值
theme(panel.grid = element_blank(), panel.background = element_blank(),
axis.line = element_line(color = 'black')) + #去除默认的背景色、边框等
labs(x = '', y = 'DLL3 expression level',title = "GSE40275")+
theme(plot.title = element_text(hjust = 0.5, size = 16))
#添加中位数,上、下四分位数指示线
p1=p + stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
geom = 'crossbar', width = 0.35, size = 0.5, color = 'black') +
stat_summary(fun.data = function(x) median_hilow(x, 0.5),
geom = 'errorbar', width = 0.2, color = 'black')
###自己设定对比
compaired <- list( c("Normal", "SCLC"))
###星号,还修改了上面的显著性括号的两侧小竖线长度为0
p2 = p1+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test =t.test,tip_length = 0)
p2
ggsave(filename = 'GSE40275nomol-tumor-boxplot.pdf',width = 9, height = 7)
dev.off()
save(p2,file = "GSE40275nomol_tumor.Rdata")
###CD276在正常组织和SCLC中的表达
dat = t(exp2) %>%
as.data.frame() %>%
rownames_to_column() %>%
mutate(Group)
dim(dat)
pdat = dat%>%
pivot_longer(cols =(!rowname)&(!Group),
names_to = "gene",
values_to = "count")
p <- ggplot(pdat, aes(Group, count)) +
geom_beeswarm(cex = 1.0, show.legend = FALSE,aes(color = Group)) + #cex 可以通过点的尺寸定义宽度
scale_color_manual(values = c('blue', 'red')) + #颜色自定义赋值
theme(panel.grid = element_blank(), panel.background = element_blank(),
axis.line = element_line(color = 'black')) + #去除默认的背景色、边框等
labs(x = '', y = 'B7H3 expression level',title = "GSE43346")+
theme(plot.title = element_text(hjust = 0.5, size = 16))
p1=p + stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
geom = 'crossbar', width = 0.35, size = 0.5, color = 'black') +
stat_summary(fun.data = function(x) median_hilow(x, 0.5),
geom = 'errorbar', width = 0.2, color = 'black')
compaired <- list( c("Normal", "SCLC"))
p3 = p1+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test =t.test,tip_length = 0)
p3
colnames(pd)
k = str_detect(pd$source_name_ch1,"small");table(k)
pd_SCLC = pd[k,]
exprSet3 = exprSet2[,colnames(exprSet2) %in% rownames(pd_SCLC)]
p = identical(rownames(pd_SCLC),colnames(exprSet3));p
library("ggstatsplot")
c('DLL3','CD276','SCDO1') %in% rownames(exprSet3)
dat =as.data.frame( t(exprSet3))
cor(dat$DLL3,dat$CD276)
cor = ggscatterstats(data = dat,
y = DLL3,
x = CD276,
centrality.para = "mean",
margins = "both",
xfill = "red",
yfill = "blue",
marginal.type = "histogram",
title = "Relationship"
)
cor
ggsave(filename = 'GSE40275DLL3_B7H3_correlation.pdf',width = 8, height = 7)
p2+p3+cor
ggsave(filename = 'GSE40275.pdf',width = 14, height = 7)
下面三个数据集有两个基因的信息,在GSE40275中可以看到,DLL3和CD276两基因的表达正相关性挺高。诶这里B7H3?奇怪 这个数据出现问题,导致我怀疑这里的相关性准不准了。
这个数据会不会把离群点删掉结果更好?应该也不会,太散乱了
这个数据集GSE44447中就在小细胞肺癌中高表达了,虽然没有显著差异,数据量较少。
在小细胞肺癌中,B7-H3明显是高表达的;且病例分布广和表达比其他靶点高[1-3]。
(B) The negative expression of the B7-H3 protein in normal lung tissue. (C) The positive expression of the B7-H3 protein in adenocarcinoma of the lung.
并且这几个靶点明显没有共表达。
靶向 DLL3、B7H3 的多特异性抗体、CART 疗法和 ADC 药物在治疗晚期末线SCLC 上展现了良好潜力。对比临床数据,DS-7300(B7H3 ADC)vs HS-20093(B7H3 ADC)vs AMG-757(DLL3/CD3 双抗) vs Keytruda(PD-1):ORR(52.4% vs 77.8% vs 40% vs 35.7%),mPFS(5.6m vs NA vs 4.9m vs 2.1m)。 看起来靶向B7H3疗效很不错,虽然可能是叠加了ADC的疗效。(这里我也没有对比B7H3单抗
来源于与瘤伴舞公众号
还找了一个单细胞数据GSE164404来验证看看。
首先下载文件,读取进来
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(dplyr)
library(hdf5r)
library(ggplot2)
library(clustree)
library(cowplot)
library(data.table)
dir='GSE164404_RAW//'
samples=list.files( dir )
samples
sceList = lapply(samples,function(pro){
print(pro)
sce =CreateSeuratObject(counts = Read10X_h5( file.path(dir,pro)) ,
project = gsub('_filtered_feature_bc_matrix.h5','',gsub('^GSM[0-9]*_','',pro) ) ,
min.cells = 5,
min.features = 300 )
return(sce)
})
sceList
samples
sce.all=sceList[[1]]
as.data.frame(sce.all@assays$RNA@layers$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident)
##才五千多个细胞呀
中间省略掉QC和降维聚类…
#######step4: 检查常见分群情况
###### 检查常见分群情况 ######
rm(list=ls())
library(ggplot2)
dir.create("3-cell")
setwd("3-cell")
getwd()
sce.all.int=readRDS("../2-harmony/sce.all_int.rds")
#marker
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A', ## T
'CD19', 'CD79A', 'MS4A1' , ## B
'IGHG1', 'MZB1', 'SDC1', ## Plasma
'CD68', 'CD163', 'CD14',
'VCAN', 'FCN1',
'TPSAB1' , 'TPSB2', # mast cells,
'RCVRN','FPR1' , 'ITGAM' ,
'C1QA', 'C1QB', # mac
'S100A9', 'S100A8', 'MMP19',# monocyte
'MKI67' , 'TOP2A','LYZ', ##myloid
'LAMP3', 'IDO1','IDO2',## DC3
'CD1E','CD1C', # DC2
'CLEC10A', 'CLEC12A','LILRA4', ##pDC
'FGFBP2','NCR1','KLRK1','KLRC1','NCAM1','FCGR3A', # NK
'KIT', ##MAST
'CSF3R','ITGAM','ITGB2','FUT4','FCGR3B','FCGR2A', ##Neutrophil
'FGF7','MME', 'ACTA2','VWF','COL1A1', ## human fibo
'DCN', 'LUM', 'GSN' , ## mouse PDAC fibo
'PECAM1', 'VWF', ## endo
'EPCAM' , 'KRT19','KRT7' # epi
)
sce.all = sce.all.int
library(stringr)
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(sce.all, features = unique(genes_to_check),
assay='RNA' ) + coord_flip()
p
###看到了觉得奇怪,又去看了眼数据信息
#CD45/CD31-阴性啊!
嘶,就是说,知识不用就,模糊了….
通常我们拿到了肿瘤相关的单细胞转录组的表达量矩阵后的第一层次降维聚类分群通常是:immune (CD45+,PTPRC),epithelial/cancer (EpCAM+,EPCAM),stromal (CD10+,MME,fibro or CD31+,PECAM1,endo)
所以这里就是应该只剩下上皮了
#ggsave('check_last_markers.pdf',height = 11,width = 11)
mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
'#E95C59', '#E59CC4', '#AB3282', '#BD956A',
'#9FA3A8', '#E0D4CA', '#C5DEBA', '#F7F398',
'#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
'#712820', '#DCC1DD', '#CCE0F5', '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
'#968175')
p_umap=DimPlot(sce.all, reduction = "umap",cols = mycolors,pt.size = 0.4,
group.by = "RNA_snn_res.0.5",label = T,label.box = T)
p_umap
library(patchwork)
p+p_umap
ggsave('markers_umap_2.pdf',width = 13,height = 8)
#####step5: 细胞生物学命名
sce.all@meta.data$celltype = 'epithelium'
table(sce.all@meta.data$celltype)
##看看表达状态
features = c('DLL3','CD276','CD274')
FeaturePlot(sce.all,features = features,ncol = 3 )
###相关性分析
exprSet <- sce.all@assays[["RNA"]]@layers$data
exprSet = as.data.frame(exprSet)
exprSet[1:6,1:6]
dim(sce.all)
dim(exprSet)
colnames(sce.all)
rownames(sce.all)
rownames(exprSet) = rownames(sce.all)
dat = exprSet[c('DLL3','CD276'),]
dat<-as.data.frame(t(dat)) #转置
cor_result <- cor(dat, method = "pearson") #method一般默认pearson
plot(dat[,"DLL3"], dat[,"CD276"],
xlab = "DLL3 Expression",
ylab = "CD276 Expression",
main = "Scatter Plot")
ggscatter(dat, x = 'DLL3', y = "CD276",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
add.params = list(color = "red", fill = "lightgray"),
xlab = "DLL3 Expression", ylab = "CD276 Expression")
看起来在小细胞肺癌里还是没有什么相关性的,跟在NSCLC中差不多。
参考文献
1.J Immunother Cancer. 2019 Mar 8;7(1):65.
2.Front Oncol.The Expression of Three Negative Co-Stimulatory B7 Family Molecules in Small Cell Lung Cancer and Their Effect on Prognosis. 2021 Apr 15:11:600238.
3.Transl Oncol.B7-H3/CD276 and small-cell lung cancer: What's new? 2024 Jan:39:101801.