前面我们做了STAR基因组索引构建所需资源的评估,现在我们看下reads比对
对计算资源和时间的需求。
首先下载获得样品SRR1039517
的原始测序数据,数据量约为34 million
长度为63
个碱基的双端reads
, 总碱基数 4.3 G
左右。具体见NGS基础:测序原始数据批量下载。
fastq-dump -v --split-3 --gzip SRR1039517
seqkit stats SRR1039517_1.fastq.gz SRR1039517_1.fastq.gz
file format type num_seqs sum_len min_len avg_len max_len
SRR1039517_1.fastq.gz FASTQ DNA 34,298,260 2,160,790,380 63 63 63
SRR1039517_1.fastq.gz FASTQ DNA 34,298,260 2,160,790,380 63 63 63
把所有序列拆分为最多包含20万条reads
的小的原始测序数据文件。并重命名,去掉文件名前面填充的0
。主要是便于后面进行批量处理。
seqkit split2 -s 200000 -1 SRR1039517_1.fastq.gz -2 SRR1039517_2.fastq.gz
# 如果是ubuntu,可能需要使用rename.ul
rename '_00' '_' SRR1039517_1.fastq.gz.split/*
rename '_0' '_' SRR1039517_1.fastq.gz.split/*
拆分后,总共获得172
对测序数据。我们通过cat
命令对序列文件进行累加,模拟不同测序深度的样品,并进行比对,统计每次比对需要的计算资源和时间资源。这里使用了4
个线程对每个样品并行比对,不同样品是串行比对。
/bin/rm -f SRR1039517_1.part_1.fastq.gz SRR1039517_2.part_2.fastq.gz
for part in `seq 1 172`; do
cat SRR1039517_1.fastq.gz.split/SRR1039517_1.part_${part}.fastq.gz >>SRR1039517_1.part_1.fastq.gz
cat SRR1039517_1.fastq.gz.split/SRR1039517_2.part_${part}.fastq.gz >>SRR1039517_2.part_2.fastq.gz
i=SRR1039517
mkdir -p ${i}.${part}
mkdir -p tmp
/usr/bin/time -v -o star.${i}.${part}.log STAR --runMode alignReads --runThreadN 4 \
--readFilesIn ${i}_1.part_1.fastq.gz ${i}_2.part_2.fastq.gz \
--readFilesCommand zcat --genomeDir star_GRCh38 \
--outFileNamePrefix ${i}.${part}/${i}. --outFilterType BySJout --outSAMattributes NH HI AS NM MD \
--outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 \
--alignIntronMin 20 --alignIntronMax 1000000 \
--alignMatesGapMax 1000000 \
--outFilterMatchNminOverLread 0.66 --outFilterScoreMinOverLread 0.66 \
--winAnchorMultimapNmax 70 --seedSearchStartLmax 45 \
--outSAMattrIHstart 0 --outSAMstrandField intronMotif \
--genomeLoad LoadAndKeep \
--outTmpDir /tmp/${i}.${part}/ \
--outSAMtype BAM Unsorted --quantMode GeneCounts
done
重新整理一个通用的awk
脚本,命名为timeIntegrate2.awk
:
#!/usr/bin/awk -f
function to_minutes(time_str) {
a=split(time_str,array1,":");
minutes=0;
count=1;
for(i=a;i>=1;--i) {
minutes+=array1[count]*60^(i-2);
count+=1;
}
return minutes;
}
BEGIN{OFS="\t"; FS=": "; }
ARGIND==1{if(FNR==1) header=$0; else datasize=$0;}
ARGIND==2{
if(FNR==1 && outputHeader==1)
print "Time_cost\tMemory_cost\tnCPU", header;
if($1~/Elapsed/) {
time_cost=to_minutes($2);
} else if($1~/Maximum resident set size/){
memory_cost=$2/10^6;
} else if($1~/CPU/) {
cpu=($2+0)/100};
}
END{print time_cost, memory_cost, cpu, datasize}
具体整合代码
i=SRR1039517
/bin/rm -f ${i}_1.part_1.fastq.gz ${i}_2.part_2.fastq.gz GRCh38_39517_star_reads_map.summary
for part in `seq 1 172`; do
cat ${i}_1.fastq.gz.split/${i}_1.part_${part}.fastq.gz >>${i}_1.part_1.fastq.gz
cat ${i}_1.fastq.gz.split/${i}_2.part_${part}.fastq.gz >>${i}_2.part_2.fastq.gz
~/soft/seqkit stats -j 8 ${i}_1.part_1.fastq.gz ${i}_2.part_2.fastq.gz | sed 's/,//g' | \
awk 'BEGIN{OFS="\t"}{reads+=$4; bases+=$5}END{print "nReads\tnBases"; print reads/10^6, bases/10^9}' | \
awk -v outputHeader=${part} -f ./timeIntegrate2.awk - star.${i}.${part}.log \
>>GRCh38_39517_star_reads_map.summary
done
/bin/rm -f ${i}_1.part_1.fastq.gz ${i}_2.part_2.fastq.gz
paste <(for part in `seq 1 172`; do du -s ${i}.${part} | \
awk -v i=${part} '{if(i==1) print "outputSize"; print $1/10^6}'; done) \
GRCh38_39517_star_reads_map.summary >GRCh38_39517_star_reads_map.summary2
结果如下
outputSize (G) Time_cost (minutes) Memory_cost (Gb) nCPU nReads (million) nBases (Gb)
0.034332 0.28 9.9072 1.9 0.4 0.0252
0.067064 0.228 13.2356 2.28 0.8 0.0504
0.099504 0.340167 15.5853 1.98 1.2 0.0756
0.131672 0.326 17.3644 2.55 1.6 0.1008
0.164156 0.413 18.7912 2.43 2 0.126
0.196804 0.463167 19.969 2.52 2.4 0.1512
0.229112 0.4615 20.939 2.88 2.8 0.1764
0.261368 0.516667 21.7589 2.83 3.2 0.2016
0.294484 0.568 22.5243 2.89 3.6 0.2268
0.32744 0.586667 23.182 3.06 4 0.252
0.359588 0.601 23.7126 3.26 4.4 0.2772
0.391772 0.683333 24.1722 3.09 4.8 0.3024
0.424232 0.826167 24.595 2.98 5.2 0.3276
0.456388 0.854333 24.9512 3.28 5.6 0.3528
0.488488 0.917167 25.2751 3.29 6 0.378
0.520716 0.937 25.5713 3.33 6.4 0.4032
0.552924 0.999167 25.8344 3.32 6.8 0.4284
0.584712 1.0485 26.0817 3.37 7.2 0.4536
0.616944 1.0965 26.2919 3.39 7.6 0.4788
0.64944 1.135 26.512 3.45 8 0.504
0.681984 1.17517 26.6706 3.49 8.4 0.5292
0.714948 1.14683 26.8639 3.52 8.8 0.5544
0.748312 1.10883 27.0044 3.47 9.2 0.5796
0.780928 1.14 27.1747 3.55 9.6 0.6048
0.8138 1.26317 27.2956 3.5 10 0.63
0.846996 1.3 27.4216 3.54 10.4 0.6552
0.880352 1.486 27.5397 3.59 10.8 0.6804
0.91364 1.49983 27.6463 3.6 11.2 0.7056
0.946512 1.55083 27.7506 3.53 11.6 0.7308
0.979624 1.61967 27.8361 3.49 12 0.756
1.01286 1.62517 27.943 3.57 12.4 0.7812
1.04676 1.66767 28.0145 3.59 12.8 0.8064
1.08059 1.6755 28.1329 3.63 13.2 0.8316
1.11306 1.7145 28.1842 3.63 13.6 0.8568
1.14475 1.7735 28.2665 3.65 14 0.882
1.17665 1.89567 28.308 3.62 14.4 0.9072
1.20814 1.96067 28.3619 3.55 14.8 0.9324
1.23969 1.948 28.4374 3.67 15.2 0.9576
1.27152 2.00583 28.4658 3.69 15.6 0.9828
1.30336 2.05633 28.5352 3.65 16 1.008
1.33483 2.04983 28.5566 3.68 16.4 1.0332
1.36708 2.1245 28.6243 3.65 16.8 1.0584
1.39974 2.25417 28.6217 3.67 17.2 1.0836
1.43171 2.27817 28.7085 3.72 17.6 1.1088
1.46321 2.322 28.7688 3.71 18 1.134
1.49494 2.282 28.8025 3.71 18.4 1.1592
1.5268 2.3305 28.8338 3.72 18.8 1.1844
1.55854 2.36067 28.8627 3.73 19.2 1.2096
1.59012 2.412 28.89 3.68 19.6 1.2348
1.62192 2.43117 28.917 3.69 20 1.26
1.65361 2.49833 28.9436 3.68 20.4 1.2852
1.68481 2.56833 28.9686 3.7 20.8 1.3104
1.71662 2.58367 28.9926 3.74 21.2 1.3356
1.74859 2.64333 29.0157 3.75 21.6 1.3608
1.78063 2.72033 29.0376 3.71 22 1.386
1.81293 2.7445 29.0589 3.74 22.4 1.4112
1.84533 2.79183 29.0792 3.75 22.8 1.4364
1.87753 2.7765 29.0982 3.74 23.2 1.4616
1.91008 2.86767 29.1176 3.72 23.6 1.4868
1.94256 3.00517 29.1356 3.75 24 1.512
1.97522 3.06933 29.1535 3.72 24.4 1.5372
2.00784 3.06633 29.1702 3.72 24.8 1.5624
2.04029 3.083 29.1852 3.64 25.2 1.5876
2.07288 3.0285 29.2007 3.77 25.6 1.6128
2.1055 3.049 29.2164 3.78 26 1.638
2.13861 3.08583 29.2336 3.74 26.4 1.6632
2.1727 3.18417 29.2544 3.75 26.8 1.6884
2.20504 3.22867 29.2682 3.78 27.2 1.7136
2.23734 3.35617 29.2812 3.72 27.6 1.7388
2.26982 3.3635 29.2938 3.78 28 1.764
2.30174 3.38567 29.3051 3.8 28.4 1.7892
2.33385 3.45417 29.316 3.79 28.8 1.8144
2.36633 3.49933 29.3272 3.78 29.2 1.8396
2.3987 3.44333 29.3379 3.78 29.6 1.8648
2.43073 3.545 29.3478 3.81 30 1.89
2.46358 3.74117 29.3585 3.79 30.4 1.9152
2.49704 3.79167 29.3694 3.81 30.8 1.9404
2.52971 3.68567 29.3786 3.82 31.2 1.9656
2.56169 3.7315 29.3871 3.78 31.6 1.9908
2.59398 3.78933 29.3955 3.77 32 2.016
2.62646 3.748 29.4036 3.78 32.4 2.0412
2.65862 3.76967 29.4113 3.81 32.8 2.0664
2.69066 3.92767 29.4191 3.8 33.2 2.0916
2.72278 3.95983 29.4265 3.81 33.6 2.1168
2.75499 4.024 29.4338 3.8 34 2.142
2.78692 4.05083 29.4403 3.82 34.4 2.1672
2.81906 4.1095 29.447 3.81 34.8 2.1924
2.8514 4.18167 29.4532 3.78 35.2 2.2176
2.88368 4.09467 29.4598 3.81 35.6 2.2428
2.91615 4.25317 29.4661 3.78 36 2.268
2.94896 4.43683 29.4726 3.81 36.4 2.2932
2.98138 4.49333 29.4781 3.79 36.8 2.3184
3.01411 4.38067 29.4837 3.77 37.2 2.3436
3.04697 4.34867 29.4898 3.82 37.6 2.3688
3.0799 4.382 29.4955 3.83 38 2.394
3.11275 4.35467 29.5012 3.81 38.4 2.4192
3.14541 4.9245 29.5065 3.42 38.8 2.4444
3.17823 4.62317 29.5116 3.77 39.2 2.4696
3.21122 4.63733 29.5172 3.82 39.6 2.4948
3.24487 4.68467 29.5231 3.83 40 2.52
3.27845 4.711 29.5285 3.85 40.4 2.5452
3.31161 4.8395 29.5348 3.8 40.8 2.5704
3.3444 4.7865 29.5413 3.82 41.2 2.5956
3.37759 4.883 29.5497 3.81 41.6 2.6208
3.41041 5.14933 29.5569 3.83 42 2.646
3.44278 5.204 29.5618 3.85 42.4 2.6712
3.47552 5.124 29.5659 3.8 42.8 2.6964
3.5085 5.10783 29.5709 3.81 43.2 2.7216
3.54084 5.109 29.5752 3.84 43.6 2.7468
3.57301 5.12683 29.5787 3.78 44 2.772
3.60639 5.23267 29.5826 3.82 44.4 2.7972
3.63972 5.61417 29.5867 3.64 44.8 2.8224
3.67239 6.52033 29.5927 3.48 45.2 2.8476
3.70516 7.05183 29.5007 3.42 45.6 2.8728
3.73857 6.949 29.6024 3.48 46 2.898
3.77162 6.888 29.6069 3.45 46.4 2.9232
3.80419 5.52133 29.6101 3.87 46.8 2.9484
3.83614 5.8575 29.6128 3.84 47.2 2.9736
3.86866 5.91483 29.6156 3.85 47.6 2.9988
3.90138 5.78217 29.6197 3.8 48 3.024
3.93331 5.74883 29.6226 3.84 48.4 3.0492
3.96561 5.77317 29.6252 3.83 48.8 3.0744
3.99835 5.76267 29.6295 3.83 49.2 3.0996
4.03138 5.97367 29.6337 3.82 49.6 3.1248
4.06518 8.01017 29.3961 3.38 50 3.15
4.09969 8.138 29.6426 3.38 50.4 3.1752
4.13252 9.755 29.4349 3.26 50.8 3.2004
4.1656 6.49233 29.5448 3.63 51.2 3.2256
4.19947 8.876 29.4903 3.37 51.6 3.2508
4.23331 6.5035 29.6538 3.64 52 3.276
4.26617 6.48483 29.6557 3.84 52.4 3.3012
4.29916 6.68383 29.6583 3.84 52.8 3.3264
4.33211 6.51417 29.6607 3.84 53.2 3.3516
4.36514 6.37817 29.6629 3.86 53.6 3.3768
4.39864 6.481 29.6653 3.81 54 3.402
4.4327 6.43167 29.6689 3.84 54.4 3.4272
4.46627 8.351 29.6731 3.01 54.8 3.4524
4.49779 7.4865 29.6494 3.6 55.2 3.4776
4.52981 10.1942 29.4533 3.19 55.6 3.5028
4.56185 10.7782 29.2792 3.19 56 3.528
4.59322 9.7155 29.3352 3.38 56.4 3.5532
4.62515 9.30333 29.5208 3.41 56.8 3.5784
4.65749 7.293 29.6829 3.56 57.2 3.6036
4.68936 6.48533 29.6845 3.85 57.6 3.6288
4.72118 6.53733 29.686 3.84 58 3.654
4.75413 6.69733 29.6876 3.84 58.4 3.6792
4.78725 6.46033 29.6895 3.84 58.8 3.7044
4.81937 6.389 29.6908 3.81 59.2 3.7296
4.85105 6.33033 29.6923 3.84 59.6 3.7548
4.88335 9.42183 29.6938 2.62 60 3.78
4.91568 7.21483 29.4664 3.54 60.4 3.8052
4.94768 6.4205 29.6966 3.78 60.8 3.8304
4.97946 6.337 29.6978 3.85 61.2 3.8556
5.01139 7.06983 29.699 3.86 61.6 3.8808
5.04355 7.15017 29.7001 3.86 62 3.906
5.07542 7.215 29.7013 3.85 62.4 3.9312
5.10747 7.25917 29.7027 3.85 62.8 3.9564
5.13966 7.27783 29.704 3.86 63.2 3.9816
5.17208 7.3745 29.7054 3.83 63.6 4.0068
5.20458 7.33233 29.707 3.86 64 4.032
5.23759 7.41067 29.7087 3.85 64.4 4.0572
5.27022 7.46467 29.71 3.87 64.8 4.0824
5.30309 7.49667 29.7112 3.86 65.2 4.1076
5.33602 7.554 29.7125 3.86 65.6 4.1328
5.36921 7.5765 29.7138 3.88 66 4.158
5.4022 7.25533 29.7152 3.86 66.4 4.1832
5.43503 6.9645 29.7164 3.85 66.8 4.2084
5.468 6.99817 29.7176 3.87 67.2 4.2336
5.50081 7.01467 29.719 3.87 67.6 4.2588
5.53405 7.02883 29.7202 3.86 68 4.284
5.56759 7.152 29.7214 3.85 68.4 4.3092
5.58427 7.089 29.7221 3.85 68.5965 4.32158
50 Million
或3 Gb
时,比对时间与数据量近乎完美正相关library(ImageGP)
library(ggplot2)
library(patchwork)
p1 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nReads",
yvariable = "Time_cost", smooth_method = "auto",
x_label ="Sequencing reads (Million)", y_label = "Running time (minutes)") +
scale_x_continuous(breaks=seq(0,70, by=5)) +
scale_y_continuous(breaks=seq(1,12,length=12))
p2 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nBases",
yvariable = "Time_cost", smooth_method = "auto",
x_label ="Sequencing reads (Gb)", y_label = "Running time (minutes)") +
scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +
scale_y_continuous(breaks=seq(1,12,length=12))
p1 + p2
20 Milion
(1.2 G)时,STAR比对所需内存随测序量增加快速增加,从9.9 G
快速增到28 G
。30 G
以内。p1 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nReads",
yvariable = "Memory_cost", smooth_method = "auto",
x_label ="Sequencing reads (Million)", y_label = "Maximum physical memory required (Gb)") +
scale_x_continuous(breaks=seq(0,70, by=5)) +
scale_y_continuous(breaks=seq(9,30,length=22))
p2 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nBases",
yvariable = "Memory_cost", smooth_method = "auto",
x_label ="Sequencing reads (Gb)", y_label = "Maximum physical memory required (Gb)") +
scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +
scale_y_continuous(breaks=seq(9,30,length=22))
p1 + p2
从下面这张图可以看出,比对时间异常的样本,它们的CPU利用率也相应的低,但没有太明显规律性。推测这部分程序运行时可能受到了其它程序对硬盘读写的影响。
p1 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nReads",
yvariable = "nCPU", smooth_method = "auto",
x_label ="Sequencing reads (Million)", y_label = "Number of CPUs used") +
scale_x_continuous(breaks=seq(0,70, by=5)) +
scale_y_continuous(breaks=seq(1,4.5,by=0.5), limits=c(1.5,4))
p2 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nBases",
yvariable = "nCPU", smooth_method = "auto",
x_label ="Sequencing reads (Gb)", y_label = "Number of CPUs used") +
scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +
scale_y_continuous(breaks=seq(1,4.5,by=0.5),limits=c(1.5,4))
p1 + p2
p1 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nReads",
yvariable = "outputSize", smooth_method = "auto",
x_label ="Sequencing reads (Million)", y_label = "Disk space usages (Gb)") +
scale_x_continuous(breaks=seq(0,70, by=5)) +
scale_y_continuous(breaks=seq(0,6,by=0.5), limits=c(0,6))
p2 <- sp_scatterplot("GRCh38_39517_star_reads_map.summary2", melted = T, xvariable = "nBases",
yvariable = "outputSize", smooth_method = "auto",
x_label ="Sequencing reads (Gb)", y_label = "Disk space usages (Gb)") +
scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +
scale_y_continuous(breaks=seq(0,6,by=0.5), limits=c(0,6))
p1 + p2