前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >SCLC某两个基因共表达情况

SCLC某两个基因共表达情况

作者头像
生信菜鸟团
发布2024-04-11 15:00:56
820
发布2024-04-11 15:00:56
举报
文章被收录于专栏:生信菜鸟团生信菜鸟团

出发点在于最近的一个小需求。很多情况下用在线数据库的原因是很方便查看一个很小的点,例如生存分析例如共表达例如人群中多少突变比例;虽说这在成型的Figure1代码里也很容易,但是你要用数据去说服别人的时候,还是给别人一个在线数据库更让人信服。

随便一搜,就出来一个小细胞肺癌的芯片数据集,那就来看看这个

哦转换完ID之后发现这个数据集没有我想要的两个基因; 换一个,随便搜了几篇文章里所使用到的数据集: ①GSE6044和GSE40275 ②GSE43346和GSE6044 ③GSE6044和GSE11969 ④GSE40275,GSE1037,GSE44447 综合多个数据集的数据可以看到,DLL3在小细胞肺癌中高表达。

这里只是一个小小的印证猜想的简陋代码

代码语言:javascript
复制
rm(list = ls())
library(GEOquery)
eSet = getGEO("GSE40275", destdir = '.', getGPL = F)
class(eSet)
length(eSet)
eSet = eSet[[1]]
class(eSet)
exp <- exprs(eSet)
dim(exp) 
#是否需要log
range(exp)
#exp = log2(exp+1) #需要log才log
#boxplot(exp,las = 2)

pd <- pData(eSet)

p = identical(rownames(pd),colnames(exp));p
if(!p) {
  s = intersect(rownames(pd),colnames(exp))
  exp = exp[,s]
  pd = pd[s,]
}

gpl_number <- eSet@annotation;gpl_number


# 1.Group----
library(stringr)
library(tidyr)
library(tibble)
library(dplyr)

colnames(pd)
k = str_detect(pd$characteristics_ch1.1,"Small|normal");table(k)
pd = pd[k,]

exp = as.data.frame(exp)
exp = exp[,colnames(exp) %in% rownames(pd)]
p = identical(rownames(pd),colnames(exp));p

k = str_detect(pd$source_name_ch1,"normal");table(k)
Group = ifelse(k,"Normal","SCLC")
Group = factor(Group,levels = c("Normal","SCLC"))
Group


#2.探针注释的获取-----------------
library(tinyarray)
find_anno(gpl_number)

#1.加probe_id列,把行名变成一列
get_gpl_txt = function (gpl, destdir = getwd(), download = F) 
{
  if (!stringr::str_starts(gpl, "GPL|gpl")) 
    stop("wrong GPL accession")
  gpl = stringr::str_to_upper(gpl)
  url = paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", 
               gpl, "&targ=self&form=text&view=data")
  message(url)
  if (download) {
    download.file(url, destfile = paste0(gpl, ".txt"))
    message("If the download fails, check that your data is microarray data.")
  }
}

get_gpl_txt("GPL15974")
a1=read.delim("GPL15974.txt",comment.char = "#",skip = 1,header = F)
colnames(a1) = a1[2,]
b=a1[-c(1,2),]
library(clusterProfiler)
library(org.Hs.eg.db)
s2e = bitr(b$ENTREZ_GENE_ID, 
           fromType = "ENTREZID",
           toType = "SYMBOL",
           OrgDb = org.Hs.eg.db)

ids=merge(b,s2e,by.x="ENTREZ_GENE_ID",by.y="ENTREZID")
ids=ids[,c(2,8)]
colnames(ids)=c("probe_id","symbol")
library(dplyr)
exp = as.data.frame(exp)
dat = mutate(exp,probe_id = rownames(exp))
ids = distinct(ids,symbol,.keep_all = T)
dat = inner_join(dat,ids,by="probe_id")
exprSet2 = avereps(dat[,-c(59:60)],       
                   ID = dat$symbol)
exprSet2[1:5,1:5]
exprSet2 = as.data.frame(exprSet2)

c('DLL3','CD276') %in% rownames(exprSet2)
exp1 = exprSet2['DLL3',]
exp2 = exprSet2['CD276',]
# exp1 =  t(as.data.frame(exp1))  #后面很多代码还是用基因×样本的形式作为输入,所以在这里转置一下
# rownames(exp1) = 'SLC17A9'


###DLL3在正常组织和SCLC中的表达
dat = t(exp1) %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  mutate(Group)
dim(dat)

pdat = dat%>% 
  pivot_longer(cols =(!rowname)&(!Group),
               names_to = "gene",
               values_to = "count")

library(ggplot2)
library(reshape2)
library(ggbeeswarm) 
library(beeswarm)
library(ggsignif)

p <- ggplot(pdat, aes(Group, count)) +
  geom_beeswarm(cex = 1.0, show.legend = FALSE,aes(color = Group)) + #cex 可以通过点的尺寸定义宽度
  scale_color_manual(values = c('blue', 'red')) + #颜色自定义赋值
  theme(panel.grid = element_blank(), panel.background = element_blank(),
        axis.line = element_line(color = 'black')) + #去除默认的背景色、边框等
  labs(x = '', y = 'DLL3 expression level',title = "GSE40275")+
  theme(plot.title = element_text(hjust = 0.5, size = 16))


#添加中位数,上、下四分位数指示线
p1=p + stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
                    geom = 'crossbar', width = 0.35, size = 0.5, color = 'black') +
  stat_summary(fun.data = function(x) median_hilow(x, 0.5),
               geom = 'errorbar', width = 0.2, color = 'black')


###自己设定对比
compaired <- list( c("Normal", "SCLC"))
###星号,还修改了上面的显著性括号的两侧小竖线长度为0
p2 = p1+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test =t.test,tip_length = 0)
p2

ggsave(filename = 'GSE40275nomol-tumor-boxplot.pdf',width = 9, height = 7)
dev.off()

save(p2,file = "GSE40275nomol_tumor.Rdata")


###CD276在正常组织和SCLC中的表达
dat = t(exp2) %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  mutate(Group)
dim(dat)
pdat = dat%>% 
  pivot_longer(cols =(!rowname)&(!Group),
               names_to = "gene",
               values_to = "count")
p <- ggplot(pdat, aes(Group, count)) +
  geom_beeswarm(cex = 1.0, show.legend = FALSE,aes(color = Group)) + #cex 可以通过点的尺寸定义宽度
  scale_color_manual(values = c('blue', 'red')) + #颜色自定义赋值
  theme(panel.grid = element_blank(), panel.background = element_blank(),
        axis.line = element_line(color = 'black')) + #去除默认的背景色、边框等
  labs(x = '', y = 'B7H3 expression level',title = "GSE43346")+
  theme(plot.title = element_text(hjust = 0.5, size = 16))

p1=p + stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
                    geom = 'crossbar', width = 0.35, size = 0.5, color = 'black') +
  stat_summary(fun.data = function(x) median_hilow(x, 0.5),
               geom = 'errorbar', width = 0.2, color = 'black')

compaired <- list( c("Normal", "SCLC"))
p3 = p1+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test =t.test,tip_length = 0)
p3


colnames(pd)
k = str_detect(pd$source_name_ch1,"small");table(k)
pd_SCLC = pd[k,]

exprSet3 = exprSet2[,colnames(exprSet2) %in% rownames(pd_SCLC)]
p = identical(rownames(pd_SCLC),colnames(exprSet3));p


library("ggstatsplot")

c('DLL3','CD276','SCDO1') %in% rownames(exprSet3)

dat =as.data.frame( t(exprSet3)) 
cor(dat$DLL3,dat$CD276)
cor = ggscatterstats(data = dat, 
               y = DLL3, 
               x = CD276,
               centrality.para = "mean",
               margins = "both",
               xfill = "red", 
               yfill = "blue", 
               marginal.type = "histogram",
               title = "Relationship"
)
cor
ggsave(filename = 'GSE40275DLL3_B7H3_correlation.pdf',width = 8, height = 7)


p2+p3+cor
ggsave(filename = 'GSE40275.pdf',width = 14, height = 7)

下面三个数据集有两个基因的信息,在GSE40275中可以看到,DLL3和CD276两基因的表达正相关性挺高。诶这里B7H3?奇怪 这个数据出现问题,导致我怀疑这里的相关性准不准了。

这个数据会不会把离群点删掉结果更好?应该也不会,太散乱了

这个数据集GSE44447中就在小细胞肺癌中高表达了,虽然没有显著差异,数据量较少。

在小细胞肺癌中,B7-H3明显是高表达的;且病例分布广和表达比其他靶点高[1-3]。

(B) The negative expression of the B7-H3 protein in normal lung tissue. (C) The positive expression of the B7-H3 protein in adenocarcinoma of the lung.

并且这几个靶点明显没有共表达。

靶向 DLL3、B7H3 的多特异性抗体、CART 疗法和 ADC 药物在治疗晚期末线SCLC 上展现了良好潜力。对比临床数据,DS-7300(B7H3 ADC)vs HS-20093(B7H3 ADC)vs AMG-757(DLL3/CD3 双抗) vs Keytruda(PD-1):ORR(52.4% vs 77.8% vs 40% vs 35.7%),mPFS(5.6m vs NA vs 4.9m vs 2.1m)。 看起来靶向B7H3疗效很不错,虽然可能是叠加了ADC的疗效。(这里我也没有对比B7H3单抗

来源于与瘤伴舞公众号

还找了一个单细胞数据GSE164404来验证看看。

首先下载文件,读取进来

代码语言:javascript
复制
rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(dplyr)
library(hdf5r)
library(ggplot2)
library(clustree)
library(cowplot)
library(data.table)
dir='GSE164404_RAW//' 
samples=list.files( dir )
samples

sceList = lapply(samples,function(pro){ 
  print(pro) 
  sce =CreateSeuratObject(counts =  Read10X_h5( file.path(dir,pro)) ,
                          project =   gsub('_filtered_feature_bc_matrix.h5','',gsub('^GSM[0-9]*_','',pro) ) ,
                          min.cells = 5,
                          min.features = 300 )
  return(sce)
})
sceList
samples

sce.all=sceList[[1]]


as.data.frame(sce.all@assays$RNA@layers$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident) 
##才五千多个细胞呀

中间省略掉QC和降维聚类…

代码语言:javascript
复制
#######step4: 检查常见分群情况
###### 检查常见分群情况  ######
rm(list=ls())
library(ggplot2)
dir.create("3-cell")
setwd("3-cell")  
getwd()
sce.all.int=readRDS("../2-harmony/sce.all_int.rds")

#marker
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',  ## T 
                   'CD19', 'CD79A', 'MS4A1' ,  ## B 
                   'IGHG1', 'MZB1', 'SDC1', ## Plasma 
                   'CD68', 'CD163', 'CD14', 
                   'VCAN', 'FCN1', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'MKI67' , 'TOP2A','LYZ', ##myloid
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'CLEC10A', 'CLEC12A','LILRA4', ##pDC
                   'FGFBP2','NCR1','KLRK1','KLRC1','NCAM1','FCGR3A', # NK 
                   'KIT', ##MAST
                   'CSF3R','ITGAM','ITGB2','FUT4','FCGR3B','FCGR2A', ##Neutrophil
                   'FGF7','MME', 'ACTA2','VWF','COL1A1', ## human  fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19','KRT7' # epi 
)

sce.all = sce.all.int
library(stringr)  
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA'  )  + coord_flip()

p 


###看到了觉得奇怪,又去看了眼数据信息
#CD45/CD31-阴性啊!

嘶,就是说,知识不用就,模糊了….

通常我们拿到了肿瘤相关的单细胞转录组的表达量矩阵后的第一层次降维聚类分群通常是:immune (CD45+,PTPRC),epithelial/cancer (EpCAM+,EPCAM),stromal (CD10+,MME,fibro or CD31+,PECAM1,endo)

所以这里就是应该只剩下上皮了

代码语言:javascript
复制
#ggsave('check_last_markers.pdf',height = 11,width = 11)
mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
             '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
             '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
             '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
             '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
             '#968175')

p_umap=DimPlot(sce.all, reduction = "umap",cols = mycolors,pt.size = 0.4,
               group.by = "RNA_snn_res.0.5",label = T,label.box = T) 
p_umap
library(patchwork)
p+p_umap
ggsave('markers_umap_2.pdf',width = 13,height = 8)



#####step5: 细胞生物学命名
sce.all@meta.data$celltype = 'epithelium'
table(sce.all@meta.data$celltype)


##看看表达状态
features = c('DLL3','CD276','CD274')
FeaturePlot(sce.all,features = features,ncol = 3 )
代码语言:javascript
复制
###相关性分析
exprSet <- sce.all@assays[["RNA"]]@layers$data
exprSet = as.data.frame(exprSet)
exprSet[1:6,1:6]
dim(sce.all)
dim(exprSet)
colnames(sce.all)
rownames(sce.all)
rownames(exprSet) = rownames(sce.all)

dat = exprSet[c('DLL3','CD276'),]
dat<-as.data.frame(t(dat)) #转置

cor_result <- cor(dat, method = "pearson") #method一般默认pearson

plot(dat[,"DLL3"], dat[,"CD276"],
     xlab = "DLL3 Expression",
     ylab = "CD276 Expression",
     main = "Scatter Plot")


ggscatter(dat, x = 'DLL3', y = "CD276",
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          add.params = list(color = "red", fill = "lightgray"),
          xlab = "DLL3 Expression", ylab = "CD276 Expression")     

看起来在小细胞肺癌里还是没有什么相关性的,跟在NSCLC中差不多。

参考文献

代码语言:javascript
复制
1.J Immunother Cancer. 2019 Mar 8;7(1):65.
2.Front Oncol.The Expression of Three Negative Co-Stimulatory B7 Family Molecules in Small Cell Lung Cancer and Their Effect on Prognosis. 2021 Apr 15:11:600238. 
3.Transl Oncol.B7-H3/CD276 and small-cell lung cancer: What's new? 2024 Jan:39:101801. 
本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2024-03-31,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
相关产品与服务
数据库
云数据库为企业提供了完善的关系型数据库、非关系型数据库、分析型数据库和数据库生态工具。您可以通过产品选择和组合搭建,轻松实现高可靠、高可用性、高性能等数据库需求。云数据库服务也可大幅减少您的运维工作量,更专注于业务发展,让企业一站式享受数据上云及分布式架构的技术红利!
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档