# 1、Encoder输入

Encoder输入过程如下图所示：

[机、器、学、习] -> [machine、learning]

[学、习、机、器] -> [learning、machine]

encoder的输入在转换成id后变为[[0,1,2,3],[2,3,0,1]]。

```import tensorflow as tf

chinese_embedding = tf.constant([[0.11,0.21,0.31,0.41],
[0.21,0.31,0.41,0.51],
[0.31,0.41,0.51,0.61],
[0.41,0.51,0.61,0.71]],dtype=tf.float32)

english_embedding = tf.constant([[0.51,0.61,0.71,0.81],
[0.52,0.62,0.72,0.82],
[0.53,0.63,0.73,0.83],
[0.54,0.64,0.74,0.84]],dtype=tf.float32)

position_encoding = tf.constant([[0.01,0.01,0.01,0.01],
[0.02,0.02,0.02,0.02],
[0.03,0.03,0.03,0.03],
[0.04,0.04,0.04,0.04]],dtype=tf.float32)

encoder_input = tf.constant([[0,1,2,3],[2,3,0,1]],dtype=tf.int32)

with tf.variable_scope("encoder_input"):
encoder_embedding_input = tf.nn.embedding_lookup(chinese_embedding,encoder_input)
encoder_embedding_input = encoder_embedding_input + position_encoding

with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print(sess.run([encoder_embedding_input]))
```

# 2、Encoder Block

## 2.1 Attention简单回顾

Attention其实就是计算一种相关程度，看下面的例子：

Attention通常可以进行如下描述，表示为将query(Q)和key-value pairs映射到输出上，其中query、每个key、每个value都是向量，输出是V中所有values的加权，其中权重是由Query和每个key计算出来的，计算方法分为三步：

1）计算比较Q和K的相似度，用f来表示：$f(Q, K_i),\quad i = 1, 2, \dots, m$

2）将得到的相似度进行softmax归一化：$\alpha_i = \dfrac{e^{f(Q, K_i)}}{\sum_{j=1}^{m} e^{f(Q, K_j)}},\quad i = 1, 2, \dots, m$

3）针对计算出来的权重，对所有的values进行加权求和，得到Attention向量：$\text{Attention}(Q, K, V) = \sum_{i=1}^{m} \alpha_i V_i$

## 2.2 Scaled Dot-Product Attention

```with tf.variable_scope("encoder_scaled_dot_product_attention"):
encoder_Q = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_Q)
encoder_K = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_K)
encoder_V = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_V)

encoder_Q = tf.reshape(encoder_Q,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_K = tf.reshape(encoder_K,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_V = tf.reshape(encoder_V,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))

attention_map = tf.matmul(encoder_Q,tf.transpose(encoder_K,[0,2,1]))
attention_map = attention_map / 8
attention_map = tf.nn.softmax(attention_map)

with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print(sess.run(attention_map))
print(sess.run(encoder_first_sa_output))
```

```w_Z = tf.constant([[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4]],dtype=tf.float32)

with tf.variable_scope("encoder_input"):
encoder_embedding_input = tf.nn.embedding_lookup(chinese_embedding,encoder_input)
encoder_embedding_input = encoder_embedding_input + position_encoding

encoder_Q = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_Q)
encoder_K = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_K)
encoder_V = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_V)

encoder_Q = tf.reshape(encoder_Q,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_K = tf.reshape(encoder_K,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_V = tf.reshape(encoder_V,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))

encoder_Q_split = tf.split(encoder_Q,2,axis=2)
encoder_K_split = tf.split(encoder_K,2,axis=2)
encoder_V_split = tf.split(encoder_V,2,axis=2)

encoder_Q_concat = tf.concat(encoder_Q_split,axis=0)
encoder_K_concat = tf.concat(encoder_K_split,axis=0)
encoder_V_concat = tf.concat(encoder_V_split,axis=0)

attention_map = tf.matmul(encoder_Q_concat,tf.transpose(encoder_K_concat,[0,2,1]))
attention_map = attention_map / 8
attention_map = tf.nn.softmax(attention_map)

weightedSumV = tf.matmul(attention_map,encoder_V_concat)

outputs_z = tf.concat(tf.split(weightedSumV,2,axis=0),axis=2)

outputs = tf.matmul(tf.reshape(outputs_z,(-1,tf.shape(outputs_z)[2])),w_Z)
outputs = tf.reshape(outputs,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))

import numpy as np
with tf.Session() as sess:
#     print(sess.run(encoder_Q))
#     print(sess.run(encoder_Q_split))
#print(sess.run(weightedSumV))
#print(sess.run(outputs_z))
print(sess.run(outputs))
```

split函数主要有三个参数，第一个是要split的tensor，第二个是分割成几个tensor，第三个是在哪一维进行切分。也就是说， encoder_Q_split = tf.split(encoder_Q,2,axis=2)，执行这段代码的话，encoder_Q这个tensor会按照axis=2切分成两个同样大的tensor，这两个tensor的axis=0和axis=1维度的长度是不变的，但axis=2的长度变为了一半，我们在后面通过图示的方式来解释。

## 2.4 Add & Normalize & FFN

```with tf.variable_scope("encoder_block"):
encoder_Q = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_Q)
encoder_K = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_K)
encoder_V = tf.matmul(tf.reshape(encoder_embedding_input,(-1,tf.shape(encoder_embedding_input)[2])),w_V)

encoder_Q = tf.reshape(encoder_Q,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_K = tf.reshape(encoder_K,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))
encoder_V = tf.reshape(encoder_V,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))

encoder_Q_split = tf.split(encoder_Q,2,axis=2)
encoder_K_split = tf.split(encoder_K,2,axis=2)
encoder_V_split = tf.split(encoder_V,2,axis=2)

encoder_Q_concat = tf.concat(encoder_Q_split,axis=0)
encoder_K_concat = tf.concat(encoder_K_split,axis=0)
encoder_V_concat = tf.concat(encoder_V_split,axis=0)

attention_map = tf.matmul(encoder_Q_concat,tf.transpose(encoder_K_concat,[0,2,1]))
attention_map = attention_map / 8
attention_map = tf.nn.softmax(attention_map)

weightedSumV = tf.matmul(attention_map,encoder_V_concat)

outputs_z = tf.concat(tf.split(weightedSumV,2,axis=0),axis=2)

sa_outputs = tf.matmul(tf.reshape(outputs_z,(-1,tf.shape(outputs_z)[2])),w_Z)
sa_outputs = tf.reshape(sa_outputs,(tf.shape(encoder_embedding_input)[0],tf.shape(encoder_embedding_input)[1],-1))

sa_outputs = sa_outputs + encoder_embedding_input

W_f = tf.constant([[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4]])

ffn_outputs = tf.matmul(tf.reshape(sa_outputs,(-1,tf.shape(sa_outputs)[2])),W_f)
ffn_outputs = tf.reshape(ffn_outputs,(tf.shape(sa_outputs)[0],tf.shape(sa_outputs)[1],-1))

encoder_outputs = ffn_outputs + sa_outputs

import numpy as np
with tf.Session() as sess:
#     print(sess.run(encoder_Q))
#     print(sess.run(encoder_Q_split))
#print(sess.run(weightedSumV))
#print(sess.run(outputs_z))
#print(sess.run(sa_outputs))
#print(sess.run(ffn_outputs))
print(sess.run(encoder_outputs))
```

# 3、Decoder Block

## 3.1 Decoder输入

[机、器、学、习] -> [ machine、learning]

```english_embedding = tf.constant([[0.51,0.61,0.71,0.81],
[0.61,0.71,0.81,0.91],
[0.71,0.81,0.91,1.01],
[0.81,0.91,1.01,1.11]],dtype=tf.float32)

position_encoding = tf.constant([[0.01,0.01,0.01,0.01],
[0.02,0.02,0.02,0.02],
[0.03,0.03,0.03,0.03],
[0.04,0.04,0.04,0.04]],dtype=tf.float32)

decoder_input = tf.constant([[1,2],[2,1]],dtype=tf.int32)

with tf.variable_scope("decoder_input"):
decoder_embedding_input = tf.nn.embedding_lookup(english_embedding,decoder_input)
decoder_embedding_input = decoder_embedding_input + position_encoding[0:tf.shape(decoder_embedding_input)[1]]
```

```w_Q_decoder_sa = tf.constant([[0.15,0.25,0.35,0.45,0.55,0.65],
[0.25,0.35,0.45,0.55,0.65,0.75],
[0.35,0.45,0.55,0.65,0.75,0.85],
[0.45,0.55,0.65,0.75,0.85,0.95]],dtype=tf.float32)

w_K_decoder_sa = tf.constant([[0.13,0.23,0.33,0.43,0.53,0.63],
[0.23,0.33,0.43,0.53,0.63,0.73],
[0.33,0.43,0.53,0.63,0.73,0.83],
[0.43,0.53,0.63,0.73,0.83,0.93]],dtype=tf.float32)

w_V_decoder_sa = tf.constant([[0.17,0.27,0.37,0.47,0.57,0.67],
[0.27,0.37,0.47,0.57,0.67,0.77],
[0.37,0.47,0.57,0.67,0.77,0.87],
[0.47,0.57,0.67,0.77,0.87,0.97]],dtype=tf.float32)
```

```with tf.variable_scope("decoder_sa_block"):
decoder_Q = tf.matmul(tf.reshape(decoder_embedding_input,(-1,tf.shape(decoder_embedding_input)[2])),w_Q_decoder_sa)
decoder_K = tf.matmul(tf.reshape(decoder_embedding_input,(-1,tf.shape(decoder_embedding_input)[2])),w_K_decoder_sa)
decoder_V = tf.matmul(tf.reshape(decoder_embedding_input,(-1,tf.shape(decoder_embedding_input)[2])),w_V_decoder_sa)

decoder_Q = tf.reshape(decoder_Q,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))
decoder_K = tf.reshape(decoder_K,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))
decoder_V = tf.reshape(decoder_V,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))

decoder_Q_split = tf.split(decoder_Q,2,axis=2)
decoder_K_split = tf.split(decoder_K,2,axis=2)
decoder_V_split = tf.split(decoder_V,2,axis=2)

decoder_Q_concat = tf.concat(decoder_Q_split,axis=0)
decoder_K_concat = tf.concat(decoder_K_split,axis=0)
decoder_V_concat = tf.concat(decoder_V_split,axis=0)

decoder_sa_attention_map_raw = tf.matmul(decoder_Q_concat,tf.transpose(decoder_K_concat,[0,2,1]))
decoder_sa_attention_map = decoder_sa_attention_map_raw / 8
```

```diag_vals = tf.ones_like(decoder_sa_attention_map[0,:,:])
tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()
decoder_sa_attention_map = tf.nn.softmax(decoder_sa_attention_map)
```

```import numpy as np
with tf.Session() as sess:
print(sess.run(decoder_sa_attention_map))
```

```weightedSumV = tf.matmul(decoder_sa_attention_map,decoder_V_concat)

decoder_outputs_z = tf.concat(tf.split(weightedSumV,2,axis=0),axis=2)

decoder_sa_outputs = tf.matmul(tf.reshape(decoder_outputs_z,(-1,tf.shape(decoder_outputs_z)[2])),w_Z_decoder_sa)

decoder_sa_outputs = tf.reshape(decoder_sa_outputs,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))

with tf.Session() as sess:
print(sess.run(decoder_sa_outputs))
```

## 3.3 encoder-decoder attention

```w_Q_decoder_sa2 = tf.constant([[0.2,0.3,0.4,0.5,0.6,0.7],
[0.3,0.4,0.5,0.6,0.7,0.8],
[0.4,0.5,0.6,0.7,0.8,0.9],
[0.5,0.6,0.7,0.8,0.9,1]],dtype=tf.float32)

w_K_decoder_sa2 = tf.constant([[0.18,0.28,0.38,0.48,0.58,0.68],
[0.28,0.38,0.48,0.58,0.68,0.78],
[0.38,0.48,0.58,0.68,0.78,0.88],
[0.48,0.58,0.68,0.78,0.88,0.98]],dtype=tf.float32)

w_V_decoder_sa2 = tf.constant([[0.22,0.32,0.42,0.52,0.62,0.72],
[0.32,0.42,0.52,0.62,0.72,0.82],
[0.42,0.52,0.62,0.72,0.82,0.92],
[0.52,0.62,0.72,0.82,0.92,1.02]],dtype=tf.float32)

w_Z_decoder_sa2 = tf.constant([[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4],
[0.1,0.2,0.3,0.4]],dtype=tf.float32)

with tf.variable_scope("decoder_encoder_attention_block"):

decoder_sa_outputs = decoder_sa_outputs + decoder_embedding_input

encoder_decoder_Q = tf.matmul(tf.reshape(decoder_sa_outputs,(-1,tf.shape(decoder_sa_outputs)[2])),w_Q_decoder_sa2)
encoder_decoder_K = tf.matmul(tf.reshape(encoder_outputs,(-1,tf.shape(encoder_outputs)[2])),w_K_decoder_sa2)
encoder_decoder_V = tf.matmul(tf.reshape(encoder_outputs,(-1,tf.shape(encoder_outputs)[2])),w_V_decoder_sa2)

encoder_decoder_Q = tf.reshape(encoder_decoder_Q,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))
encoder_decoder_K = tf.reshape(encoder_decoder_K,(tf.shape(encoder_outputs)[0],tf.shape(encoder_outputs)[1],-1))
encoder_decoder_V = tf.reshape(encoder_decoder_V,(tf.shape(encoder_outputs)[0],tf.shape(encoder_outputs)[1],-1))

encoder_decoder_Q_split = tf.split(encoder_decoder_Q,2,axis=2)
encoder_decoder_K_split = tf.split(encoder_decoder_K,2,axis=2)
encoder_decoder_V_split = tf.split(encoder_decoder_V,2,axis=2)

encoder_decoder_Q_concat = tf.concat(encoder_decoder_Q_split,axis=0)
encoder_decoder_K_concat = tf.concat(encoder_decoder_K_split,axis=0)
encoder_decoder_V_concat = tf.concat(encoder_decoder_V_split,axis=0)

encoder_decoder_attention_map_raw = tf.matmul(encoder_decoder_Q_concat,tf.transpose(encoder_decoder_K_concat,[0,2,1]))
encoder_decoder_attention_map = encoder_decoder_attention_map_raw / 8

encoder_decoder_attention_map = tf.nn.softmax(encoder_decoder_attention_map)

weightedSumV = tf.matmul(encoder_decoder_attention_map,encoder_decoder_V_concat)

encoder_decoder_outputs_z = tf.concat(tf.split(weightedSumV,2,axis=0),axis=2)

encoder_decoder_outputs = tf.matmul(tf.reshape(encoder_decoder_outputs_z,(-1,tf.shape(encoder_decoder_outputs_z)[2])),w_Z_decoder_sa2)

encoder_decoder_attention_outputs = tf.reshape(encoder_decoder_outputs,(tf.shape(decoder_embedding_input)[0],tf.shape(decoder_embedding_input)[1],-1))

encoder_decoder_attention_outputs = encoder_decoder_attention_outputs + decoder_sa_outputs

W_f = tf.constant([[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4]])

decoder_ffn_outputs = tf.matmul(tf.reshape(encoder_decoder_attention_outputs,(-1,tf.shape(encoder_decoder_attention_outputs)[2])),W_f)
decoder_ffn_outputs = tf.reshape(decoder_ffn_outputs,(tf.shape(encoder_decoder_attention_outputs)[0],tf.shape(encoder_decoder_attention_outputs)[1],-1))

decoder_outputs = decoder_ffn_outputs + encoder_decoder_attention_outputs

with tf.Session() as sess:
print(sess.run(decoder_outputs))
```

# 4、全连接层及最终输出

```W_final = tf.constant([[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4],
[0.2,0.3,0.5,0.4]])

logits = tf.matmul(tf.reshape(decoder_outputs,(-1,tf.shape(decoder_outputs)[2])),W_final)
logits = tf.reshape(logits,(tf.shape(decoder_outputs)[0],tf.shape(decoder_outputs)[1],-1))

logits = tf.nn.softmax(logits)

y = tf.one_hot(decoder_input,depth=4)

loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y)

```

488 篇文章114 人订阅

0 条评论