我几乎可以让我的needleman实现工作,但我对如何处理特定案例的跟踪感到困惑。
其思想是,为了重新构造序列(最长路径),我们重新计算,以确定矩阵的分数来源。我遇到的问题是,当右下角分数不在匹配矩阵中,而是在insert列矩阵中时(这意味着所得到的跟踪返回序列应该有一个insert )。
这些序列以a2m格式记录,其中插入序列被记录为小写字符。因此,在最后的输出中,ZZ与AAAC的对齐应该是AAac。当我用手追溯时,我得到了AAAc,因为我只访问了一次Ic矩阵。Here是我的白板图片。正如你所看到的,我有三个黑色箭头和一个绿色箭头,这就是为什么我的回溯给我AAAc。我应该数第一个单元格,然后停在位置1,1?我不知道我将如何改变我实现这一目标的方式。
注意,这里使用的替换矩阵是BLOSUM62。递推关系是
M(i,j) = max(Ic(i-1,j-1)+subst, M(i-1,j-1)+subst, Ir(i-1,j-1)+subst)
Ic(i,j) = max(Ic(i,j-1)-extend, M(i,j-1)-open, Ir(i,j-1)-double)
Ir(i,j) = max(Ic(i-1,j)-double, M(i-1,j)-open, Ir(i-1,j)-extend)编辑:这里是重新编写的traceback_col_seq函数,以便更干净。注意,score_cell现在返回的是thisM、thisC、thisR,而不是它们的最大值。这个版本与AaAc一样,仍然有着同样的问题,现在又有了另一个问题,为什么它会在1,2重新进入Ic。然而,这个版本更容易读懂。
def traceback_col_seq(self):
i, j = self.maxI-1, self.maxJ-1
self.traceback = list()
matrixDict = {0:'M',1:'Ic',2:'Ir',3:'M',4:'Ic',5:'Ir',6:'M',7:'Ic',8:'Ir'}
while i > 0 or j > 0:
chars = self.col_seq[j-1] + self.row_seq[i-1]
thisM, thisC, thisR = self.score_cell(i, j, chars)
cell = thisM + thisC + thisR
prevMatrix = matrixDict[cell.index(max(cell))]
print(cell, prevMatrix,i,j)
if prevMatrix == 'M':
i -= 1; j -= 1
self.traceback.append(self.col_seq[j])
elif prevMatrix == 'Ic':
j -= 1
self.traceback.append(self.col_seq[j].lower())
elif prevMatrix == 'Ir':
i -= 1
self.traceback.append('-')
return ''.join(self.traceback[::-1])下面是生成动态规划矩阵并追溯对齐的python类。还有一个分数函数,用于检查所产生的对齐是否正确。
class global_aligner():
def __init__(self, subst, open=12, extend=1, double=3):
self.extend, self.open, self.double, self.subst = extend, open, double, subst
def __call__(self, row_seq, col_seq):
#add alphabet error checking?
score_align(row_seq, col_seq)
return traceback_col_seq()
def init_array(self):
"""initialize three numpy arrays, set values in 1st column and row"""
self.M = zeros((self.maxI, self.maxJ), float)
self.Ic = zeros((self.maxI, self.maxJ), float)
self.Ir = zeros((self.maxI, self.maxJ), float)
for i in xrange(1,self.maxI):
self.M[i][0], self.Ic[i][0], self.Ir[i][0] = \
-float('inf'), -float('inf'), -(self.open+self.extend*(i-1))
for j in xrange(1,self.maxJ):
self.M[0][j], self.Ir[0][j], self.Ic[0][j] = \
-float('inf'), -float('inf'), -(self.open+self.extend*(j-1))
self.Ic[0][0] = self.Ir[0][0] = -float('inf')
def score_cell(self, i, j, chars):
"""score a matrix cell based on the 9 total neighbors (3 each direction)"""
thisM = [self.M[i-1][j-1]+self.subst[chars], self.Ic[i-1][j-1]+ \
self.subst[chars], self.Ir[i-1][j-1]+self.subst[chars]]
thisC = [self.M[i][j-1]-self.open, self.Ic[i][j-1]-self.extend, \
self.Ir[i][j-1]-self.double]
thisR = [self.M[i-1][j]-self.open, self.Ic[i-1][j]-self.double, \
self.Ir[i-1][j]-self.extend]
return max(thisM), max(thisC), max(thisR)
def score_align(self, row_seq, col_seq):
"""build dynamic programming matrices to align two sequences"""
self.row_seq, self.col_seq = list(row_seq), list(col_seq)
self.maxI, self.maxJ = len(self.row_seq)+1, len(self.col_seq)+1
self.init_array() #initialize arrays
for i in xrange(1, self.maxI):
row_char = self.row_seq[i-1]
for j in xrange(1, self.maxJ):
chars = row_char+self.col_seq[j-1]
self.M[i][j], self.Ic[i][j], self.Ir[i][j] = self.score_cell(i, j, chars)
def traceback_col_seq(self):
"""trace back column sequence in matrices in a2m format"""
i, j = self.maxI-1, self.maxJ-1
self.traceback = list()
#find which matrix to start in
#THIS IS WHERE THE PROBLEM LIES I THINK
cell = (self.M[i][j], self.Ic[i][j], self.Ir[i][j])
prevMatrix = cell.index(max(cell))
while i > 1 and j > 1:
if prevMatrix == 0: #M matrix
i -= 1; j -= 1 #step up diagonally
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
diag = self.score_cell(i, j, prevChars) #re-score diagonal cell
prevMatrix = diag.index(max(diag)) #determine which matrix that was
self.traceback.append(self.col_seq[j])
elif prevMatrix == 1: #Ic matrix
j -= 1
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
left = self.score_cell(i, j, prevChars)
prevMatrix = left.index(max(left))
self.traceback.append(self.col_seq[j].lower())
elif prevMatrix == 2: #Ir matrix
i -= 1
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
up = self.score_cell(i, j, prevChars)
prevMatrix = up.index(max(up))
self.traceback.append('-')
for j in xrange(j,0,-1): #hit top of matrix before ending, add chars
self.traceback.append(self.col_seq[j-1])
for i in xrange(i,0,-1): #hit left of matrix before ending, add gaps
self.traceback.append('-')
print(''.join(self.row[::-1]))
return ''.join(self.traceback[::-1])
def score_a2m(self, s1, s2):
"""scores an a2m alignment of two sequences. I believe this function correctly
scores alignments and is used to test my alignments. The value produced by this
function should be the same as the largest value in the bottom right of the three
matrices"""
s1, s2 = list(s1.strip('.')), list(s2.strip('.'))
s1_pos, s2_pos = len(s1)-1, len(s2)-1
score, gap = 0, False
while s1_pos >= 0 and s2_pos >= 0:
if s1[s1_pos].islower() and gap is False:
score -= self.open; s1_pos -= 1; gap = True
elif s1[s1_pos].islower() and gap is True:
score -= self.extend; s1_pos -= 1
elif s2[s2_pos].islower() and gap is False:
score -= self.open; s2_pos -= 1; gap = True
elif s2[s2_pos].islower() and gap is True:
score -= self.extend; s2_pos -= 1
elif s1[s1_pos] == '-' and gap is False:
score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
elif s1[s1_pos] == '-' and gap is True:
score -= self.extend; s1_pos -= 1; s2_pos -= 1
elif s2[s2_pos] == '-' and gap is False:
score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
elif s2[s2_pos] == '-' and gap is True:
score -= self.extend; s1_pos -= 1; s2_pos -= 1
elif gap is True:
score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
s1_pos -= 1; s2_pos -= 1; gap = False
else:
score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
s1_pos -= 1; s2_pos -= 1
if s1_pos >= 0 and gap is True:
score -= self.extend*s1_pos
elif s1_pos >= 0 and gap is False:
score -= self.open+s1_pos*self.extend
if s2_pos >= 0 and gap is True:
score -= self.extend*s2_pos
elif s2_pos >= 0 and gap is False:
score -= self.open+s2_pos*self.extend
return score
test = global_aligner(blosumMatrix)
s1,s2 = 'ZZ','AAAC'
test.score_align(s1, s2)
align = test.traceback_col_seq()
print('This score: ', test.score_a2m(s1,align)
print('Correct score: ', test.score_a2m(s1,'AAac'))Blosum解析器
def parse_blosum(blosumFile):
blosumMatrix, commentFlag = dict(), False
for line in blosumFile:
if not line.startswith('#') and not commentFlag:
alphabet = line.rstrip().split()
commentFlag = True
elif commentFlag:
line = line.rstrip().split()
thisChar, line = line[0], line[1:]
for x in xrange(len(line)):
alphaChar, thisValue = alphabet[x], line[x]
blosumMatrix[thisChar+alphaChar] = int(thisValue)
return blosumMatrix发布于 2013-12-06 17:06:06
def traceback_col_seq(self):
"""
Traces back the column sequence to determine final global alignment.
Recalculates the score using score_cell.
"""
i, j = self.maxI-1, self.maxJ-1
self.traceback = list() #stores final sequence
matrixDict = {0:'M',1:'Ic',2:'Ir'} #mapping between numeric value and matrix
chars = self.col_seq[j-1] + self.row_seq[i-1] #store first characters
thisM, thisC, thisR = self.score_cell(i,j, chars)
cell = max(thisM), max(thisC), max(thisR) #determine where to start
prevMatrix = matrixDict[cell.index(max(cell))] #store where to go first
while i > 0 or j > 0:
#loop until the top left corner of the matrix is reached
if prevMatrix == 'M':
self.traceback.append(self.col_seq[j-1])
i -= 1; j -= 1
prevMatrix = matrixDict[thisM.index(max(thisM))]
elif prevMatrix == 'Ic':
self.traceback.append(self.col_seq[j-1].lower())
j -= 1
prevMatrix = matrixDict[thisC.index(max(thisC))]
elif prevMatrix == 'Ir':
self.traceback.append('-')
i -= 1
prevMatrix = matrixDict[thisR.index(max(thisR))]
chars = self.col_seq[j-1] + self.row_seq[i-1] #store next characters
thisM, thisC, thisR = self.score_cell(i,j, chars) #rescore next cell
return ''.join(self.traceback[::-1])发布于 2013-12-06 08:20:36
当你提到你对箭头颜色的评论时,根本的问题是,第二个水平箭头被标记为在M(黑色箭头)中的一个运动,而它实际上是Ic (绿色箭头)中的一个运动。这似乎是因为prevMatrix变量指示在(i-1,j-1),(i-1,j)或(i,j-1)处的最佳矩阵。从前面的角度来看,这是前一个单元格中的矩阵。因为您正在执行跟踪*回*此时,您在跟踪中“来自”的单元格实际上是(i,j) --是您当前所在的单元格。这决定了你移动的方向,从而决定了你是否对一个缺口进行调整。您最好的选择可能是有一个变量来跟踪从循环迭代到循环迭代的当前矩阵,并使用它来确定要追加到对齐字符串的字符。在每次迭代中,您可以在确定下一个单元之后更新它。
我认为,进行这些更改将澄清您在重写traceback_col_sequence函数时遇到的问题,但初步看来,您没有考虑到通过追溯所获得的信息。您正在重用score_cell(),它看起来是设计用来计算未来的分数的。当您正在追溯时,您已经知道结束,所以您不想选择前一个单元格的最大分数。您希望选择可能导致当前基准表中位置的最大分数。例如,如果你知道你在你的回溯中的矩阵是M,那么你就知道你没有通过扩大一个缺口到达那里,所以你不应该考虑那些选择。
https://stackoverflow.com/questions/20229817
复制相似问题