备注:docker运行的操作系统,推荐为Linux,windows,macOS系统改下docker可能部分功能(网络)不能正常运行
# 拉取docker镜像
docker pull doujiangbaozi/sliverworkspace:latest
# 查看docker 镜像
docker images
version: "3"
services:
SarsCov2:
image: doujiangbaozi/sliverworkspace:latest
container_name: Germline
volumes:
- /media/sliver/Data/data:/opt/data:rw #挂载原始数据,放SC2目录下
- /media/sliver/Manufacture/GL2/envs:/root/mambaforge-pypy3/envs:rw #挂载envs conda环境目录
- /media/sliver/Manufacture/GL2/config:/opt/config:rw #挂载config conda配置文件目录
- /media/sliver/Manufacture/GL2/ref:/opt/ref:rw #挂载reference目录
- /media/sliver/Manufacture/GL2/result:/opt/result:rw #挂载中间文件和输出结果目录
ports:
- "9024:9024" #ssh连接端口可以按需修改
environment:
- TZ=Asia/Shanghai #设置时区
- PS=20191124 #修改默认ssh密码
- PT=2024 #修改默认ssh连接端口
基础环境运行
# docker-compose.yml 所在目录下运行
docker-compose up -d
# 或者
docker-compose up -d -f /路径/docker-compose.yaml
# 查看docker是否正常运行,docker-compose.yaml目录下运行
docker-compose ps
# 或者
docker ps
docker 容器使用,类似于登录远程服务器
# 登录docker,使用的是ssh服务,可以本地或者远程部署使用
ssh root@192.168.6.6 -p9024
# 看到如下,显示如下提示即正常登录
(base) root@SliverWorkstation:~#
#样本编号
export sn=SRR9993255
#数据输入目录
export data=/opt/data
#数据输出、中间文件目录
export result=/opt/result
#conda安装的环境目录
export envs=/root/mambaforge-pypy3/envs
#vep cache 版本
export vep_version=108
#设置可用线程数
export threads=8
#与耳聋相关基因及突变文件
export whitelist=/opt/ref/whitelist.csv
#conda检测环境是否存在,首次运行不存在创建该环境并安装软件
if [ ! -d "${envs}/fastp" ]; then
mamba env create -f /opt/config/fastp.yaml
fi
source activate fastp
mkdir -p ${result}/${sn}/clean
mkdir -p ${result}/${sn}/qc
#使用默认参数
fastp \
-w ${threads} \
-i ${data}/Germline/${sn}_R1.fastq.gz \
-I ${data}/Germline/${sn}_R2.fastq.gz \
-o ${result}/${sn}/clean/${sn}_fastp_R1.fastq.gz \
-O ${result}/${sn}/clean/${sn}_fastp_R2.fastq.gz \
-j ${result}/${sn}/qc/${sn}_fastp.json \
-h ${result}/${sn}/qc/${sn}_fastp.html
conda deactivate
#conda检测环境是否存在,首次运行不存在创建该环境并安装软件
if [ ! -d "${envs}/align" ]; then
mamba env create -f /opt/config/align.yaml
fi
source activate align
#参考基因组文件如果不存在,去broadinstitute下载
if [ ! -f "/opt/ref/hg38/hg38.fasta" ]; then
mkdir -p /opt/ref/hg38
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.fasta.gz -d /opt/ref/hg38 -o hg38.fasta.gz
cd /opt/ref/hg38 && gzip -d /opt/ref/hg38/hg38.fasta.gz
#如果索引文件不存在,创建索引
if [ ! -f /opt/ref/hg38.fasta.amb ] ||
[ ! -f /opt/ref/hg38/hg38.fasta.ann ] ||
[ ! -f /opt/ref/hg38/hg38.fasta.bwt ] ||
[ ! -f /opt/ref/hg38/hg38.fasta.pac ] ||
[ ! -f /opt/ref/hg38/hg38.fasta.sa ]; then
bwa index /opt/ref/hg38/hg38.fasta
fi
fi
#如果samtools索引不存在,创建
if [ ! -f "/opt/ref/hg38/hg38.fasta.fai" ]; then
samtools faidx /opt/ref/hg38/hg38.fasta
fi
cd ${result}
mkdir -p ${result}/${sn}/align
#数据比对、排序
bwa mem \
-t ${threads} -M \
-R "@RG\\tID:${sn}\\tLB:${sn}\\tPL:Illumina\\tPU:NextSeq550\\tSM:${sn}" \
/opt/ref/hg38/hg38.fasta \
${result}/${sn}/clean/${sn}_fastp_R1.fastq.gz \
${result}/${sn}/clean/${sn}_fastp_R2.fastq.gz \
| sambamba view -S -f bam -l 0 /dev/stdin \
| sambamba sort -t ${threads} -m 2G \
--tmpdir=${result}/${sn} \
-o ${result}/${sn}/align/${sn}_sorted.bam /dev/stdin
ulimit -n 10240
#标记重复
sambamba markdup \
--tmpdir ${result}/${sn} \
-t ${threads} \
${result}/${sn}/align/${sn}_sorted.bam \
${result}/${sn}/align/${sn}_marked.bam
mv ${result}/${sn}/align/${sn}_marked.bam.bai ${result}/${sn}/align/${sn}_marked.bai
rm -f ${result}/${sn}/align/${sn}_sorted.bam
#数据文件没有附带bed文件,这里用bedtools反向计算一个
bedtools genomecov -ibam ${result}/${sn}/align/${sn}_marked.bam -bg > ${result}/${sn}/align/${sn}_bedtools.bed
bedtools merge -i ${result}/${sn}/align/${sn}_bedtools.bed > ${result}/${sn}/align/${sn}_merged.bed
samtools flagstat --threads ${threads} ${result}/${sn}/align/${sn}_marked.bam > ${result}/${sn}/qc/${sn}_marked.flagstat
conda deactivate
if [ ! -f "${envs}/bamdst/bamdst" ]; then
apt-get update && apt-get install -y git make gcc zlib1g-dev
git clone https://github.com/shiquan/bamdst.git "${envs}/bamdst"
cd ${envs}/bamdst
make
cd ${result}
fi
${envs}/bamdst/bamdst -p ${result}/${sn}/align/${sn}_merged.bed -o ${result}/${sn}/qc --cutoffdepth 20 ${sn}/align/${sn}_marked.bam
#conda检测环境是否存在,首次运行不存在创建该环境并安装软件
if [ ! -f "${envs}/gatk/bin/gatk" ]; then
mkdir -p ${envs}/gatk/bin
#替代下载地址
#https://github.com/broadinstitute/gatk/releases/download/4.3.0.0/gatk-4.3.0.0.zip
aria2c https://download.yzuu.cf/broadinstitute/gatk/releases/download/4.3.0.0/gatk-4.3.0.0.zip -d ${envs}/gatk/bin -o gatk.zip
apt-get install -y unzip
cd ${envs}/gatk/bin
unzip -o gatk.zip
mv ${envs}/gatk/bin/gatk-4.3.0.0/* ${envs}/gatk/bin/
rm -rf ${envs}/gatk/bin/gatk-4.3.0.0
#chmod +x ${envs}/bin/gatk
cd ${result}
fi
if [ ! -f "/opt/ref/hg38/dbsnp_146.hg38.vcf.gz" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/dbsnp_146.hg38.vcf.gz -d /opt/ref/hg38
fi
if [ ! -f "/opt/ref/hg38/dbsnp_146.hg38.vcf.gz.tbi" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/dbsnp_146.hg38.vcf.gz.tbi -d /opt/ref/hg38
fi
if [ ! -f "/opt/ref/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -d /opt/ref/hg38
fi
if [ ! -f "/opt/ref/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi -d /opt/ref/hg38
fi
if [ ! -f "/opt/ref/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz -d /opt/ref/hg38
fi
if [ ! -f "/opt/ref/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" ]; then
aria2c ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi -d /opt/ref/hg38
fi
source activate gatk
if [ ! -x "$(command -v python)" ]; then
mamba env create -f ${envs}/gatk/bin/gatkcondaenv.yml
fi
if [ ! -x "$(command -v java)" ]; then
mamba install -y openjdk=8.0.332
fi
if [ ! -f "/opt/ref/hg38/hg38.dict" ]; then
gatk CreateSequenceDictionary -R /opt/ref/hg38/hg38.fasta -O /opt/ref/hg38/hg38.dict
fi
if [ ! -f "/opt/ref/hg38.exon.interval_list" ]; then
gatk BedToIntervalList \
-I /opt/ref/hg38.exon.bed \
-SD /opt/ref/hg38/hg38.dict \
-O /opt/ref/hg38.exon.interval_list
fi
gatk BaseRecalibrator \
--known-sites /opt/ref/hg38/dbsnp_146.hg38.vcf.gz \
--known-sites /opt/ref/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
--known-sites /opt/ref/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
-L /opt/ref/hg38.exon.interval_list \
-R /opt/ref/hg38/hg38.fasta \
-I ${result}/${sn}/align/${sn}_marked.bam \
-O ${result}/${sn}/align/${sn}_recal.table
conda deactivate
source activate gatk
gatk ApplyBQSR \
--bqsr-recal-file ${result}/${sn}/align/${sn}_recal.table \
-L /opt/ref/hg38.exon.interval_list \
-R /opt/ref/hg38/hg38.fasta \
-I ${result}/${sn}/align/${sn}_marked.bam \
-O ${result}/${sn}/align/${sn}_bqsr.bam &
gatk CollectInsertSizeMetrics \
-I ${result}/${sn}/align/${sn}_marked.bam \
-O ${result}/${sn}/qc/${sn}_insertsize_metrics.txt \
-H ${result}/${sn}/qc/${sn}_insertsize_histogram.pdf &
wait
conda deactivate
source activate gatk
mkdir -p ${result}/${sn}/vcf
gatk HaplotypeCaller \
-R /opt/ref/hg38/hg38.fasta \
-L /opt/ref/hg38.exon.interval_list \
-I ${result}/${sn}/align/${sn}_bqsr.bam \
-D /opt/ref/hg38/dbsnp_146.hg38.vcf.gz \
-O ${result}/${sn}/vcf/${sn}.vcf
conda deactivate
source activate gatk
#https://gatk.broadinstitute.org/hc/en-us/articles/360035532412?id=11097
gatk VariantFiltration \
-R /opt/ref/hg38/hg38.fasta \
-V ${result}/${sn}/vcf/${sn}.vcf \
-O ${result}/${sn}/vcf/${sn}_snp.vcf \
--filter-name "SNP_DQ" \
--filter-expression "DQ < 2.0" \
--filter-name "SNP_MQ" \
--filter-expression "MQ <40.0" \
--filter-name "SNP_FS" \
--filter-expression "FS > 60.0" \
--filter-name "SNP_SOR" \
--filter-expression "SOR > 3.0" \
--filter-name "SNP_MQRankSum" \
--filter-expression "MQRankSum < -12.5" \
--filter-name "SNP_ReadPosRankSum" \
--filter-expression "ReadPosRankSum < -8.0"
gatk VariantFiltration \
-R /opt/ref/hg38/hg38.fasta \
-V ${result}/${sn}/vcf/${sn}_snp.vcf \
-O ${result}/${sn}/vcf/${sn}_filtered.vcf \
--filter-name "INDEL_DQ" \
--filter-expression "DQ < 2.0" \
--filter-name "INDEL_FS" \
--filter-expression "FS > 200.0" \
--filter-name "INDEL_SOR" \
--filter-expression "SOR > 10.0" \
--filter-name "INDEL_ReadPosRankSum" \
--filter-expression "ReadPosRankSum < -20.0" \
--filter-name "INDEL_InbreedingCoeff" \
--filter-expression "InbreedingCoeff < -0.8"
conda deactivate
#conda检测环境是否存在,首次运行不存在创建该环境并安装软件
if [ ! -d "${envs}/vep" ]; then
mamba env create -f /opt/config/vep.yaml
fi
mkdir -p /opt/result/${sn}/vcf
if [ ! -d "/opt/ref/vep-cache/homo_sapiens/${vep_version}_GRCh38" ]; then
aria2c https://ftp.ensembl.org/pub/release-${vep_version}/variation/indexed_vep_cache/homo_sapiens_vep_${vep_version}_GRCh38.tar.gz -d /opt/ref/
tar -zxvf /opt/ref/homo_sapiens_vep_${vep_version}_GRCh38.tar.gz -C /opt/ref/vep-cache/
fi
source activate vep
vep \
-i /opt/result/${sn}/vcf/${sn}_filtered.vcf \
-o /opt/result/${sn}/vcf/${sn}_filtered_vep.xls \
--offline \
--cache \
--cache_version ${vep_version} \
--everything \
--dir_cache /opt/ref/vep-cache \
--dir_plugins /opt/ref/vep-cache/Plugins \
--species homo_sapiens \
--assembly GRCh38 \
--hgvs \
--fasta /opt/ref/hg38/hg38.fasta \
--force_overwrite \
--format vcf \
--tab \
--fork 8 \
--offline
conda deactivate
#需借用gatk环境中的python来运行
source activate gatk
python ${envs}/GermlineVepAnnotationUtil.py \
--whitelist=${whitelist} \
-v ${result}/${sn}/vcf/${sn}_filtered.vcf \
-a ${result}/${sn}/vcf/${sn}_filtered_vep.xls \
-o ${result}/${sn}/${sn}.result.variants.xls
conda deactivate
source activate gatk
python ${envs}/GermlineQcUtil.py \
--out=/opt/result/${sn}/${sn}.result.qc.xls \
--sample-fastp=/opt/result/${sn}/qc/${sn}_fastp.json \
--sample-bamdst=/opt/result/${sn}/qc/coverage.report \
--sample-insertsize=/opt/result/${sn}/qc/${sn}_insertsize_metrics.txt
conda deactivate
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。