上机测序完成之后得到的测序数据为FASTQ文件
# 进入到个人目录
cd ~
## 1.建立数据库目录:在数据库下建立参考基因组数据库,注意命名习惯:参考基因组版本信息
mkdir -p database/GRCh38.105
## 2.建立项目分析目录
mkdir project
cd project
mkdir Human-16-Asthma-Trans # 注意项目命名习惯:物种-样本数-疾病-分析流程
cd Human-16-Asthma-Trans
# 建立数据存放目录
mkdir -p data/rawdata data/cleandata/trim_galore data/cleandata/fastp
# 建立比对目录
mkdir -p Mapping/Hisat2 Mapping/Subjunc
# 建立定量目录
mkdir -p Expression/featureCounts Expression/Salmon
# 查看整个分析目录准备结构
tree
├── data
│   ├── cleandata
│   │   ├── trim_galore
│   │   └── fastp
│   └── rawdata
├── Expression
│   ├── featureCounts
│   └── Salmon
└── Mapping
    ├── Hisat2
    └── Subjunc
# 将原始数据软链接到自己的文件夹
cd $HOME/project/Human-16-Asthma-Trans/data/rawdata
ln -s /home/t_rna/data/airway/fastq_raw25000/*gz ./
(例如:质量字符的ASCII码为70时,按Phred+33编码换算,质量值 Q = 70 - 33 = 37;对照图1,Q=37已经满足质量要求)
#题1:
方法1:
(rna) Mar402 09:16:20 ~/project/Human-16-Asthma-Trans/data/rawdata
$ ls
SRR1039510_1.fastq.gz SRR1039511_1.fastq.gz SRR1039512_1.fastq.gz
SRR1039510_2.fastq.gz SRR1039511_2.fastq.gz SRR1039512_2.fastq.gz
(rna) Mar402 09:16:24 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz | wc -l
100000
(rna) Mar402 09:17:46 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz | wc -l | awk '{print $0/4}'
25000
# 方法2:
(rna) Mar402 09:26:29 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |grep -c '^@SRR'
25000
#题2 方法1:
(rna) Mar402 10:38:24 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |grep '^@SRR'| less -S
方法2:
(rna) Mar402 10:38:24 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | less -S #一行分成四列
(rna) Mar402 10:46:52 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 1 | less -s
# 题三
(rna) Mar402 10:47:02 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | head
TGGGAGGCTGAGGCAGGAGAATCACTTAAACCTGGGAGGCAGAGGTTACAGTGAGCCGAGATT
AAAGAAGGCGACAGTGAGAAGGAGTCCGAGAAGAGTGATGGAGACCCAATAGTCGATCCTGAG
CTGCTGGGCCCCAAGGTCCTCCTGGTCCCAGTGGTGAAGAAGGAAAGAGAGGCCCTAATGGGG
CTTGGCTGCAGCCATCCCGCTTAGCCTGCCTCACCCACACCCGTGTGGTACCTTCAGCCCTGG
TGAGACAGGTAATTCAGTATAGTAGATTAATATTTTTAATATATATTTTCCCTTAAGATTTCC
ATTTCTCAGTGTAGAAATCATGTCTTCTTAATTGCTGAACCTTACTGCAAAAACTTGTGATGT
ATCAAGAATACCAAAACAGTTTCCTAATATACAGTATTTGAAAGTGCTTGCCATATTGGCTCT
CTCATTTTCATCTTCACCATCAACAGAGAGAGCAGCATACTTGCTTGCAGAACTGAACTTAGA
TCCAACCGCAGCTTGGCATCTTCGGTGGCCTGCAGCTCGTCCTCCAGCTCTTCCAGCTGCGTC
CGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCGCTCTGCGAGGTACTTTTTCTA
(rna) Mar402 10:51:03 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | sort |less -s
AAAAAAAAAAAAAAAAAAAAAAAAAAAGCACCTTCCCCTAAAGGGGGGATTAAAACCCAAAAA
AAAAAAAAAAAAAAAAAAAAATTTGGGTTTAAAAAAAAAAAATTTCCCCCAAAAAAGGGGGGG
AAAAAAAAAAAAAAAAAATATTTTTTAACCAACAAAAAAAACTCGGGAATCCCCCCCACAAAA
AAAAAAAAAAAAAGGAACAAAAAAGTTTGGGGGAAAAATAAAAGAAACAAAAACAAAAAAAAA
AAAAAAAAAAAAATCATTTCAGGCCAGGGATAGGGCCTCACCCCCCTAAACCCAGCACTTTGA
AAAAAAAAAAAAGCCTCAGCATTTTATCATTCCATGGAAGGAGAATCTTTTGAAAAAAAGCAT
AAAAAAAAAAAAGTGGAAGTATTTTCCTGCTTAGAATGATTTCTGTTCCCCTTTGAATGTAAG
AAAAAAAAAAAGATGCAGGACTCCTTCAGTTCTTCACTAGTCTTAGAAAAACTTTCCAGAATA
AAAAAAAAAGTCTACCAAAGGAATTTGCATCCAGCAGCAGCACTTAGACCTGCCAGCCACTGT
AAAAAAAAGACACTAACGGCCAGTGAGTTGGAGTCTCAGGGCAGGGTGGCAGTTTCCCTTGAG
AAAAAAACAAACGTTTAGCCATTTGCAAACAAATTGTCTCTTTGCAATTGTCTAATATATGCA
AAAAAAAGGGAAGAGAGTATAAAGAAGTGTCCAGATTGGCTGAAATAGCATCCCAAAGAAGAG
AAAAAAAGTTGCTTCCATATGGTTACATTGTTAATTCCATAATCGCATTTACAATATCATTAC
AAAAAAAGTTTTAAACTCAAGCAGGCCAAAACCAATATGCTTATAAGAAATAATGAAAAGTTC
(rna) Mar402 10:51:33 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | sort |uniq -c |less -s # 重复行进行统计
(rna) Mar402 10:53:08 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | sort |uniq -c | sort -k1 -n -r |less -S #重复行统计并从大到小排序
#第四题
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | wc -c
1600000
(rna) Mar402 19:44:22 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | wc -m # wc -c 统计字节数,wc -m 统计字符数(推荐用 -m)
1600000
(rna) Mar402 19:44:35 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 |head |cat -A #与正确答案不同,因为每一行后面隐藏着换行符
TGGGAGGCTGAGGCAGGAGAATCACTTAAACCTGGGAGGCAGAGGTTACAGTGAGCCGAGATT$
AAAGAAGGCGACAGTGAGAAGGAGTCCGAGAAGAGTGATGGAGACCCAATAGTCGATCCTGAG$
CTGCTGGGCCCCAAGGTCCTCCTGGTCCCAGTGGTGAAGAAGGAAAGAGAGGCCCTAATGGGG$
CTTGGCTGCAGCCATCCCGCTTAGCCTGCCTCACCCACACCCGTGTGGTACCTTCAGCCCTGG$
TGAGACAGGTAATTCAGTATAGTAGATTAATATTTTTAATATATATTTTCCCTTAAGATTTCC$
ATTTCTCAGTGTAGAAATCATGTCTTCTTAATTGCTGAACCTTACTGCAAAAACTTGTGATGT$
ATCAAGAATACCAAAACAGTTTCCTAATATACAGTATTTGAAAGTGCTTGCCATATTGGCTCT$
CTCATTTTCATCTTCACCATCAACAGAGAGAGCAGCATACTTGCTTGCAGAACTGAACTTAGA$
TCCAACCGCAGCTTGGCATCTTCGGTGGCCTGCAGCTCGTCCTCCAGCTCTTCCAGCTGCGTC$
CGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCGCTCTGCGAGGTACTTTTTCTA$
(rna) Mar402 19:44:59 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | tr -d '\n' | less -SN #此时没有换行符了,是一行内容
(rna) Mar402 19:49:34 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_1.fastq.gz |paste - - - - | cut -f 2 | tr -d '\n' |wc -m #此时再进行字符统计
1575000
#题5
read1和read2的reads数是一样的,所以SRR1039510的reads数是25000对reads(共50000条reads)
(rna) Mar402 19:49:42 ~/project/Human-16-Asthma-Trans/data/rawdata
$ zless SRR1039510_2.fastq.gz |paste - - - - | cut -f 2 | tr -d '\n' |wc -m #read1和read2的碱基数是一样的
1575000
----来自生信技能树----
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。