前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >【孟德尔随机化】GTEx数据和gwas数据可以直接MR吗?

【孟德尔随机化】GTEx数据和gwas数据可以直接MR吗?

作者头像
生信菜鸟团
发布2024-04-18 19:55:34
920
发布2024-04-18 19:55:34
举报
文章被收录于专栏:生信菜鸟团生信菜鸟团

原始代码来自:【孟德尔随机化和共定位】文献分享:青光眼的致病基因和细胞类型

https://github.com/segrelabgenomics/ TwoSampleMR_pipeline

对应的实施过程在补充材料里~

存放数据的文件夹长这样:

加载包

代码语言:javascript
复制
if (T) {
  library("data.table")
  library("dplyr")
  library("tidyr")
  library("foreign")
  library("tibble")
  library("metafor")
  library("meta")
  library("survival")
  library("ggplot2")
  library("plyr")
  library("gridExtra")
  library("gtable")
  library("grid")
  library("tidyverse")
  library("stringr")
  library("coloc")
  library("devtools")
  library("glmnet")
  library("MendelianRandomization")
  library("TwoSampleMR")
  library(SNPlocs.Hsapiens.dbSNP155.GRCh38)
  library(MungeSumstats)
  }

提前准备参数

代码语言:javascript
复制
gwas_name = "POAG_EUR"
gene_ID = "chr22:37808163:37809949:clu_24295:ENSG00000100116.16" 
gene_symb = "GCAT"
tissue = "Artery_Coronary"
qtl_type = "sQTL"
pvalue_cutoff = 0.000005 
gwas_file = "../Code_MR/GCST90011766_buildGRCh37.tsv.gz" ## 这是结局数据
path_to_QTL = "../Code_MR/GTEx_Analysis_v8_sQTL/"
file_extension = ".v8.sqtl_signifpairs.txt.gz"


gwas <- fread(gwas_file,data.table = F)
view(head(gwas))

以上的参数可以根据自己感兴趣的基因及所在的组织来变化 ~

⬇更改列名,规整数据

代码语言:javascript
复制
setnames(gwas, old=c("beta", "p_value","variant_id"), new=c("Effect", "pval","SNP"))
gwas$OA <- chartr("a-zA-Z", "A-Za-z", gwas$other_allele)
gwas$EA <- gwas$effect_allele
head(gwas)
gwas <- gwas[!duplicated(gwas$SNP),]

dir.create("../data/")
write.csv(gwas, paste0("../data/",gene_symb,"_",tissue,"_",qtl_type,"_",strsplit(gwas_name," ","_"),"_gwas.csv"), row.names = F,  quote = F)
gc()


QTL_file <-fread(paste0(path_to_QTL,tissue,file_extension))
gc()

筛选基因对应的数据

代码语言:javascript
复制
split_variant <- data.frame(str_split(QTL_file_subsest$variant_id, "_", simplify = T)[,c(1:4)])
colnames(split_variant) <- c("chr","pos","OA","EA")

QTL_file_subsest <- data.frame(cbind(QTL_file_subsest,split_variant))
QTL_file_subsest$chrpos <- paste(QTL_file_subsest$chr, QTL_file_subsest$pos, sep=":")

gwas$chrpos <- paste(gwas$chr, gwas$pos, sep=":")
gwas_subset=subset(gwas, select = c(chrpos,SNP))

#Merge the QTL data with the GWAS summary statistics on Chr:pos
GWAS_QTL_merge <- data.frame(merge(QTL_file_subsest, gwas_subset, by = "chrpos", all.x = T))
gc()

set.seed(1010)

#Rename columns and save the file after removing duplicate SNPs
setnames(GWAS_QTL_merge, old=c("slope", "slope_se", "pval_nominal"), new=c("Effect", "StdErr", "Pval"))
GWAS_QTL_merge <- GWAS_QTL_merge[!duplicated(GWAS_QTL_merge$variant_id),] ##注意这个时候SNP是空的!!

#Load the saved file and filter according to a P-value threshold
GWAS_QTL_merge_subset <- subset(GWAS_QTL_merge, Pval< pvalue_cutoff)

save(GWAS_QTL_merge_subset,file = "../tmp_data/GWAS_QTL_merge_subset.Rdata")

转换RSID

代码语言:javascript
复制
setnames(GWAS_QTL_merge_subset, old=c("Effect","EA","OA","chr","pos","Pval"), new=c("BETA","A2","A1","CHR","POS","P"))
GWAS_QTL_subset <- GWAS_QTL_merge_subset[,c("BETA","A2","A1","CHR","POS","P")]
# 转换RSID
rsidat <- MungeSumstats::format_sumstats(path=GWAS_QTL_subset,
                                         ref_genome = "GRCh38",dbSNP=155,
                                         nThread = 2,
                                         return_data =T,
                                         frq_is_maf = TRUE) 
rsidat$BP <- as.character(rsidat$BP) ##这是个坑啊,必须把POS和BP的属性调整一致后续才能匹配上
GWAS_QTL_merge <- GWAS_QTL_merge_subset %>% filter(.,.$POS %in% rsidat$BP) %>% 
  mutate(snp=rsidat$SNP[match(.$POS,rsidat$BP)])
dir.create("../tmp_data/")
write.csv(GWAS_QTL_merge, paste0("../tmp_data/",gene_symb,"_",tissue,"_",qtl_type,"_",gwas_name,"_sig_pair_qtls_filtered.csv"), row.names = F,  quote = F)

读取暴露数据

做到这一步,不难发现,作者是直接将GTEx的SQTL数据作为暴露数据读入了。

代码语言:javascript
复制
options(warn = -1) 
gene_unclumped <- read_exposure_data(
  filename = paste0("../tmp_data/",gene_symb,"_",tissue,"_",qtl_type,"_",gwas_name,"_sig_pair_qtls_filtered.csv"),
  phenotype_col = "Phenotype",
  sep = ",",
  snp_col = "snp",
  chr_col = "CHR",
  pos_col = "POS",
  beta_col = "BETA",
  se_col = "StdErr",
  eaf_col = "maf",
  effect_allele_col = "A2", 
  other_allele_col = "A1",
  pval_col = "P")
# system(paste0("rm ",paste0("../tmp_data/",gene_symb,"_",tissue,"_",qtl_type,"_",gwas_name,"_sig_pair_qtls_filtered.csv")))
save(gene_unclumped,file = "gene_unclumped.Rdata")

load("gene_unclumped.Rdata")
# Clump at r2 = 0.1 (到这儿了0312)
gene_iv <- clump_data(
  gene_unclumped,
  clump_r2 = 0.1,
  pop = "EUR")

读取结局数据

代码语言:javascript
复制
gwas_outcome <- read_outcome_data(
  filename = paste0("../data/",gene_symb,"_",tissue,"_",qtl_type,"_",strsplit(gwas_name," ","_"),"_gwas.csv"),
  sep = ",",
  snps = gene_iv$SNP,
  snp_col = "SNP",
  chr_col = "chromosome",
  pos_col = "base_pair_location",
  beta_col = "Effect",
  se_col = "standard_error",
  effect_allele_col = "EA",
  other_allele_col = 'OA',
  pval_col = "pval")

MR

代码语言:javascript
复制
# Harmonise data
gene_gwas_harmonised <- harmonise_data(gene_iv, gwas_outcome)
header_num_variants <- as.numeric(length(unique(gene_gwas_harmonised$SNP)))

# MR analysis
gene_gwas_res <- mr(gene_gwas_harmonised, method_list=c("mr_ivw", "mr_simple_median", "mr_weighted_median", "mr_egger_regression"))

if(length(gene_gwas_res$method) == 0){
  gene_gwas_res <- mr(gene_gwas_harmonised)
}

计算置信区间

代码语言:javascript
复制
#Calculate 95% CIs
gene_gwas_res$b
if("beta" %in% tolower(colnames(gwas)) || "effect" %in% tolower(colnames(gwas))){
  print("Converting Beta to OR for IWD/Wald's test")
  gene_gwas_res$b = exp(gene_gwas_res$b)
}
gene_gwas_res$LCI <- gene_gwas_res$b-1.96*gene_gwas_res$se
gene_gwas_res$UCI <- gene_gwas_res$b+1.96*gene_gwas_res$se

敏感性分析

代码语言:javascript
复制
#MR PRESSO Global test of heterogeneity Pval 
print("Obtaining MR PRESSO Global test of heterogeneity Pval")
tryCatch({heterogeneity <- mr_heterogeneity(gene_gwas_harmonised)},
         error = function(e){
           print("MR PRESSO Global test of heterogeneity failed to run")
           heterogeneity <- "NA"})

#Perform MR-Egger Intercept test and report P-value
print("Performing MR-Egger Intercept test and reporting P-value")
tryCatch({pleiotropy_res <- mr_pleiotropy_test(gene_gwas_harmonised)},
         error = function(e){
           print("MR-Egger Test failed to run")
           pleiotropy_res <- "NA"})
if(nrow(pleiotropy_res) == 0){
  intercept <- "NA"
}else{
  print("MR-Egger Intercept Test completed")
  intercept = pleiotropy_res$pval
}

#Perform MR-PRESSO as an additional sensitivity analysis
print("Performing MR-PRESSO as an additional sensitivity analysis")
tryCatch({presso_res <- run_mr_presso(gene_gwas_harmonised)},
         error = function(e){
           print("MR-PRESSO Test failed to run")
         })
if(exists("presso_res")){
  print("MR-PRESSO Test completed")
}else{
  presso_res <- NA
}

make_result_df <- data.frame(matrix(NA, nrow = 1, ncol = 6))
colnames(make_result_df) <- c("GWAS (outcome)","Gene","Tissue","molQTL type","Intron excision cluster ID (start:end sites)","No. of variants")
row <- c(gwas_name,gene_symb,tissue,qtl_type, 
         ifelse(qtl_type == "sQTL",gene_ID,""),
         header_num_variants)
make_result_df <- data.frame(rbind(make_result_df,row))
make_result_df<- na.omit(make_result_df)


gene_gwas_res$method <- ifelse((gene_gwas_res$method) == "Inverse variance weighted" | 
                                 (gene_gwas_res$method) == "Wald ratio", 
                               "MR IVW/Wald ratio", (gene_gwas_res$method))
gene_gwas_res$method <- ifelse((gene_gwas_res$method) == "Simple median", 
                               "MR Simple Median",(gene_gwas_res$method))
gene_gwas_res$method <- ifelse((gene_gwas_res$method) == "Weighted median", 
                               "MR Weighted Median",(gene_gwas_res$method))
gene_gwas_res <- data.frame(gene_gwas_res[,c("method","b","se","pval","LCI","UCI")])

a <- unlist(gene_gwas_res)[c(1:nrow(gene_gwas_res))]
if(length(a) > 1){
  names <- paste0(rep(a,nrow(gene_gwas_res)), " ",names(unlist(gene_gwas_res)))
  names <- str_remove_all(names, "method[:digit:]")
  names <- str_replace_all(names, "b[:digit:]","Beta or OR")
  names <- str_replace_all(names, "se[:digit:]","SE")
  names <- str_replace_all(names, "pval[:digit:]","Pval")
  names <- str_replace_all(names, "LCI[:digit:]","or OR L 95% CI")
  names <- str_replace_all(names, "UCI[:digit:]","or OR U 95% CI")
  gene_gwas_res_unlist <- unlist(gene_gwas_res)
  names(gene_gwas_res_unlist) <- names
}else{
  names <- paste0(rep(a,nrow(gene_gwas_res)), " ",names(unlist(gene_gwas_res)))
  names <- str_remove_all(names, "method")
  names <- str_replace_all(names, "b","Beta or OR")
  names <- str_replace_all(names, "se","SE")
  names <- str_replace_all(names, "pval","Pval")
  names <- str_replace_all(names, "LCI","or OR L 95% CI")
  names <- str_replace_all(names, "UCI","or OR U 95% CI")
  gene_gwas_res_unlist <- unlist(gene_gwas_res)
  names(gene_gwas_res_unlist) <- names
}

gene_gwas_res_unlist <- gene_gwas_res_unlist[sort(names)]
make_result_df <- data.frame(cbind(make_result_df,t(gene_gwas_res_unlist)))
colnames(make_result_df) <- str_replace_all(colnames(make_result_df) , "\\.\\.","\\.")
colnames(make_result_df) <- str_replace_all(colnames(make_result_df) , "\\."," ")
colnames(make_result_df) <- trimws(colnames(make_result_df) , "right")
colnames(make_result_df)

make_result_df <- make_result_df %>% dplyr::select(any_of(c("GWAS outcome","Gene","Tissue","molQTL type",
                                                            "Intron excision cluster ID start end sites",
                                                            "No of variants",
                                                            "MR IVW Wald ratio Beta or OR",
                                                            "MR IVW Wald ratio or OR L 95 CI",
                                                            "MR IVW Wald ratio or OR U 95 CI",
                                                            "MR IVW Wald ratio Pval",
                                                            "MR Simple Median Beta or OR", 
                                                            "MR Simple Median SE", "MR Simple Median Pval",
                                                            "MR Weighted Median Beta or OR", 
                                                            "MR Weighted Median SE","MR Weighted Median Pval", 
                                                            "MR Egger Beta or OR", 
                                                            "MR Egger SE", "MR Egger Pval")))

ncol(make_result_df)
#The results file should have 19 columns
if(ncol(make_result_df) != 19){
  to_add <- 19 - ncol(make_result_df)
  df_to_add <- data.frame(matrix(NA,ncol = to_add, nrow = 1))
  make_result_df <- data.frame(cbind(make_result_df, df_to_add))
}
make_result_df$intercept <- intercept
if(is.na(presso_res) == FALSE){
  print(head(presso_res))
  presso_df <- data.frame()
  presso_df <- data.frame(rbind(presso_df,presso_res[[1]][["Main MR results"]]))
  presso_df <- presso_df[which(presso_df$`MR.Analysis` == "Outlier-corrected"),]
  outlier_pval <- str_remove_all(presso_res[[1]][["MR-PRESSO results"]][["Outlier Test"]]$Pvalue,"\\>")
  outlier_pval <- str_remove_all(presso_res[[1]][["MR-PRESSO results"]][["Outlier Test"]]$Pvalue,"\\<")
  outlier_pval <- as.numeric(outlier_pval)
  outlier_pval <- length(which(outlier_pval < 0.05))
  make_result_df$Global_test_heterogeneity_pval <- str_remove_all(presso_res[[1]][["MR-PRESSO results"]][["Global Test"]]$Pvalue,"\\>")
  make_result_df$Global_test_heterogeneity_pval <- str_remove_all(presso_res[[1]][["MR-PRESSO results"]][["Global Test"]]$Pvalue,"\\<")
  make_result_df$Global_test_heterogeneity_pval <- round(as.numeric(make_result_df$Global_test_heterogeneity_pval),3)
  
  if(outlier_pval > 0){
    make_result_df$outlier_estimate <- presso_df$`Causal.Estimate`
    make_result_df$outlier_estimate_sd <- presso_df$Sd
    make_result_df$outlier_estimate_pval <- presso_df$`P.value`
    make_result_df$n_outlier <- outlier_pval
  }else{
    make_result_df$outlier_estimate <- NA
    make_result_df$outlier_estimate_sd <- NA
    make_result_df$outlier_estimate_pval <- NA
    make_result_df$n_outlier <- NA
  }
}else{
  make_result_df$Global_test_heterogeneity_pval <- NA
  make_result_df$outlier_estimate <- NA
  make_result_df$outlier_estimate_sd <- NA
  make_result_df$outlier_estimate_pval <- NA
  make_result_df$n_outlier <- NA
}


if(header_num_variants > 4){
  if(is.na(heterogeneity)){
    print("MR PRESSO Global test of heterogeneity failed to run despite having minimum number of variants")}}
if(header_num_variants > 4 & is.na(make_result_df$n_outlier) == FALSE){
  if(is.na(unique(c(make_result_df[1,c(22:24)]))[[1]][1])){
    print("Outlier tests failed to run")
  }else{
    print("Outlier tests look good")}}

if(ncol(make_result_df) == 25){
  print("Results file successfully generated")
}else{
  print("Results file is missing columns")
}

print("Results File:")
print(paste0("../results/RESULTS_",tissue,"_",gene_symb,"_",gene_ID,"_",qtl_type,"_",str_replace(gwas_name," ","_"),"_",Sys.Date(),".txt"))

dir.create("../results/")
write.table(make_result_df,paste0("../results/RESULTS_",
                                  tissue,"_",gene_symb,"_",qtl_type,"_",
                                  gwas_name,"_",Sys.Date(),".txt"))

gc()
本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2024-04-16,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信菜鸟团 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 加载包
  • 提前准备参数
  • 筛选基因对应的数据
  • 转换RSID
  • 读取暴露数据
  • 读取结局数据
  • MR
  • 计算置信区间
  • 敏感性分析
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档