使用TCGAbiolinks下载TCGA数据
# Package setup for the TCGAbiolinks download examples.

# Install the Bioconductor package manager if it is missing.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# TCGAbiolinks is a Bioconductor package: check for the package itself
# (not BiocManager again) and install it through BiocManager.
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  BiocManager::install("TCGAbiolinks")
}

library(TCGAbiolinks)
library(DT)
library(dplyr)
library(SummarizedExperiment)
# Query GDC for TCGA-COAD DNA methylation (450K array) files.
# `legacy` is left at its default; the original passed legacy = FALSE.
# NOTE(review): dropped on the understanding that FALSE was the default and
# that newer TCGAbiolinks releases removed the argument -- confirm against
# the installed version.
query.met <- GDCquery(
  project = "TCGA-COAD",
  data.category = "DNA Methylation",
  platform = c("Illumina Human Methylation 450")
)

# Query GDC for TCGA-COAD gene expression quantification files.
# NOTE(review): newer GDC data releases may expose "STAR - Counts" instead
# of the HTSeq workflows -- confirm against your GDC/TCGAbiolinks version.
query.exp <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - FPKM-UQ"
)

# Preview the matched files. The per-file table is obtained with
# getResults(); indexing the query object itself does not show it.
head(getResults(query.exp)[, 1:5], 5)
head(getResults(query.met)[, 1:5], 5)

# Interactive, filterable table of the expression query results.
datatable(
  getResults(query.exp, cols = c("data_type", "cases")),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)
# Download TCGA-BRCA raw gene-level counts and load them into a
# SummarizedExperiment.
# NOTE(review): newer GDC data releases may expose "STAR - Counts" instead
# of "HTSeq - Counts" -- confirm against your GDC/TCGAbiolinks version.
query <- GDCquery(
  project = "TCGA-BRCA",                         # cancer type
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"               # raw counts
)

GDCdownload(
  query,
  directory = "./project",
  method = "api",
  files.per.chunk = 100
)

# GDCprepare() must read from the same directory GDCdownload() wrote to;
# otherwise it looks in its default location and cannot find the files.
data <- GDCprepare(query, directory = "./project")

# Expression matrix: genes in rows, samples in columns.
count_data <- assay(data)
count_data[1:5, 1:5]
dim(count_data)  # 56537 x 1222 reported in the original write-up

## Clinical / sample annotation attached to the SummarizedExperiment
colData(data)[1:5, 1:5]

## Save data
# save(count_data, file = "BRCA_count.Rdata")
# Clinical data for TCGA-BRCA fetched directly from GDC.
clinical <- GDCquery_clinic(project = "TCGA-BRCA", type = "clinical")
clinical[1:5, 1:5]
dim(clinical)  # 1097 x 68 reported in the original write-up

## Save
# save(clinical, file = "BRCA_clinical.Rdata")
# write.csv(clinical, file = "TCGAbiolinks-BRCA-clinical.csv")

## Alternative: sample annotation carried by the prepared expression object
## (`data` is the result of GDCprepare() from the download step).
clinical_2 <- colData(data)
# write.csv(as.data.frame(clinical_2), file = "TCGAbiolinks-BRCA-clinical-2.csv")

## Clinical data for every TCGA project
library(data.table)
library(dplyr)
library(regexPipes)

# NOTE: this overwrites the BRCA-only `clinical` table created above.
clinical <- TCGAbiolinks::getGDCprojects()$project_id %>%
  regexPipes::grep("TCGA", value = TRUE) %>%  # keep TCGA projects only
  sort() %>%
  plyr::alply(1, GDCquery_clinic, .progress = "text") %>%
  # fill = TRUE tolerates projects whose clinical tables differ in columns.
  rbindlist(fill = TRUE)
dim(clinical)
clinical[1:5, 1:5]
# readr::write_csv(clinical, file = "BRCA_clin_indexed.csv")
# Download TCGA-BRCA miRNA expression quantification.
query <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)

## Inspect the query results
results <- getResults(query)
dim(results)
results[1:5, 1:5]
colnames(results)

## Download data
GDCdownload(
  query,
  method = "api",
  files.per.chunk = 20  # smaller chunks reduce the risk of a failed download
)

# summarizedExperiment = FALSE is passed through from the original, which
# marks it as required here ("set F").
mir_exp <- GDCprepare(query = query, summarizedExperiment = FALSE)

## Data structure
dim(mir_exp)
mir_exp[1:5, 1:5]

## Save data
# save(mir_exp, file = "BRCA_miRNA_raw.Rdata")
除了下载功能以外,TCGAbiolinks还包括了一些数据分析挖掘功能:
笔者认为,这些功能的好处在于能够比较方便地用简单几行代码完成一些固定的分析;但这恰恰也是它的缺点:包的灵活性因此大打折扣,不能很好地整合用户的其它分析流程。考虑到这一点,我可能不会过多地使用它的其它分析功能。
原文链接:http://www.sci666.com.cn/17544.html
我来说两句