tf25: Reading Comprehension and Cloze Tests with Deep Learning

Remember reading-comprehension questions from English exams back in school? First you read a passage of English, then you answer questions based on that passage.

Let me give you a reading-comprehension question first:

Big Panda learned to code when he was 21. He lives in China, has no life, and feels like a big loser. But here is one thing Panda wants you to remember… it's never too late! You can do anything if you put your heart into it!

____ is the loser. (What should go in the blank?)

This fill-in-the-blank question is trivial for a human, but very hard for a machine to answer. Getting machines to read and understand human language is a serious challenge.

In this post we'll use TensorFlow to build a reading-comprehension model and see what accuracy it can reach.

Dataset

This post uses only the Children's Book Test (CBT) dataset released by Facebook.
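The raw CBT files group each sample into 21 numbered lines: lines 1-20 are the story context, and line 21 carries the question (with the blank marked XXXXX), the answer, and the candidate answers, separated by tabs. Roughly like this (an illustrative mock-up, not a real excerpt):

1 Once upon a time there lived a poor woodman .
2 He had a wife and three daughters .
...
21 The XXXXX went into the forest .	woodman		woodman|wife|daughters|forest

The preprocessing below depends on exactly this layout: a leading line number on every line, tab-separated fields on the question line, and a blank line between samples.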

Data preprocessing

import re
import random
import ast
import itertools
import pickle
import numpy as np
 
train_data_file = './CBTest/data/cbtest_NE_train.txt'
valid_data_file = './CBTest/data/cbtest_NE_valid_2000ex.txt'
 
def preprocess_data(data_file, out_file):
	# each element of stories is a tuple: (story sentences, question tokens, answer)
	stories = []
	with open(data_file) as f:
		story = []
		for line in f:
			line = line.strip()
			if not line:
				story = []
			else:
				# drop the leading line number
				_, line = line.split(' ', 1)
				if line:
					if '\t' in line:
						# question line: question TAB answer TAB (blank) TAB candidates
						q, a, _, answers = line.split('\t')
						# tokenize
						q = [s.strip() for s in re.split(r'(\W+)', q) if s.strip()]
						stories.append((story, q, a))
					else:
						line = [s.strip() for s in re.split(r'(\W+)', line) if s.strip()]
						story.append(line)
 
	samples = []
	for story in stories:
		story_tmp = []
		content = []
		# flatten the story's sentences into one token list
		for c in story[0]:
			content += c
		story_tmp.append(content)
		story_tmp.append(story[1])
		story_tmp.append(story[2])
 
		samples.append(story_tmp)
 
	random.shuffle(samples)
	print(len(samples))
 
	with open(out_file, "w") as f:
		for sample in samples:
			f.write(str(sample))
			f.write('\n')
 
preprocess_data(train_data_file, 'train.data')
preprocess_data(valid_data_file, 'valid.data')
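 
# At this point each line of train.data / valid.data is a Python literal of the
# form [content_tokens, question_tokens, answer], e.g. (illustrative only):
# [['Once', 'upon', 'a', ...], ['The', 'XXXXX', 'went', ...], 'woodman']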
 
# build the vocabulary
def read_data(data_file):
	stories = []
	with open(data_file) as f:
		for line in f:
			line = ast.literal_eval(line.strip())
			stories.append(line)
	return stories
 
 
stories = read_data('train.data') + read_data('valid.data')
 
content_length = max([len(s) for s, _, _ in stories])
question_length = max([len(q) for _, q, _ in stories])
print(content_length, question_length)
 
vocab = sorted(set(itertools.chain(*(story + q + [answer] for story, q, answer in stories))))
vocab_size = len(vocab) + 1
print(vocab_size)
word2idx = dict((w, i + 1) for i,w in enumerate(vocab))
pickle.dump((word2idx, content_length, question_length, vocab_size), open('vocab.data', "wb"))
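 
# The pickle can be reloaded later with:
#   word2idx, content_length, question_length, vocab_size = pickle.load(open('vocab.data', 'rb'))
# Index 0 is reserved for padding, so real words are numbered from 1.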
 
# pad_sequences, adapted from Keras
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='post', truncating='post', value=0.):
    lengths = [len(s) for s in sequences]
 
    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)
 
    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break
 
    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)
 
        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))
 
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
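 
# Quick sanity check of pad_sequences (default padding/truncating is 'post', value 0):
# >>> pad_sequences([[1, 2, 3], [4, 5]], maxlen=4)
# array([[1, 2, 3, 0],
#        [4, 5, 0, 0]], dtype=int32)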
 
# convert tokens to index vectors
def to_vector(data_file, output_file):
	word2idx, content_length, question_length, _ = pickle.load(open('vocab.data', "rb"))
	
	X = []
	Q = []
	A = []
	with open(data_file) as f_i:
		for line in f_i:
			line = ast.literal_eval(line.strip())
			x = [word2idx[w] for w in line[0]]
			q = [word2idx[w] for w in line[1]]
			a = [word2idx[line[2]]]
 
			X.append(x)
			Q.append(q)
			A.append(a)
 
	X = pad_sequences(X, content_length)
	Q = pad_sequences(Q, question_length)
 
	with open(output_file, "w") as f_o:
		for i in range(len(X)):
			f_o.write(str([X[i].tolist(), Q[i].tolist(), A[i]]))
			f_o.write('\n')
 
to_vector('train.data', 'train.vec')
to_vector('valid.data', 'valid.vec')
 
 
"""
# to_word
word2idx, content_length, question_length, _ = pickle.load(open('vocab.data', "rb"))
 
def get_value(dic,value):
    for name in dic:
        if dic[name] == value:
            return name
 
with open('train.vec') as f:
	for line in f:
		line = ast.literal_eval(line.strip())
		for word in line[0]:
			print(get_value(word2idx, word))
"""

Generated files: vocab.data (the vocabulary), plus train.vec and valid.vec (vector representations of the data).

Training

https://arxiv.org/pdf/1606.02245v4.pdf
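
The model is the iterative alternating attention reader from the paper above (Sordoni et al., 2016): a bidirectional GRU encodes the passage and the question, then an inference GRU alternates between a question "glimpse" and a passage "glimpse" for 8 steps, gating each glimpse before updating its state; the final attention distribution over passage tokens scores the answer. One caveat: the code targets the TensorFlow 0.12-era API (tf.concat_v2, tf.batch_matmul, tf.concat with the axis argument first); later releases renamed or reordered these, so adjust accordingly on a newer version.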

import tensorflow as tf
import pickle
import numpy as np
import ast
from collections import defaultdict
 
train_data = 'train.vec'
valid_data = 'valid.vec'
 
word2idx, content_length, question_length, vocab_size = pickle.load(open('vocab.data', "rb"))
print(content_length, question_length, vocab_size)
 
batch_size = 64
 
train_file = open(train_data)
def get_next_batch():
	# read batch_size samples; rewind to the start of the file when it runs out
	X = []
	Q = []
	A = []
	for i in range(batch_size):
		line = train_file.readline()
		if not line:
			train_file.seek(0)
			return get_next_batch()
		line = ast.literal_eval(line.strip())
		X.append(line[0])
		Q.append(line[1])
		A.append(line[2][0])
	return X, Q, A
 
def get_test_batch():
	with open(valid_data) as f:
		X = []
		Q = []
		A = []
		for line in f:
			line = ast.literal_eval(line.strip())
			X.append(line[0])
			Q.append(line[1])
			A.append(line[2][0])
		return X, Q, A
 
 
X = tf.placeholder(tf.int32, [batch_size, content_length])   # passage
Q = tf.placeholder(tf.int32, [batch_size, question_length])  # question
A = tf.placeholder(tf.int32, [batch_size])                   # answer
 
# dropout keep probability
keep_prob = tf.placeholder(tf.float32)
 
def glimpse(weights, bias, encodings, inputs):
	# project the current inference state, score every time step of `encodings`
	# against it, and return the attention distribution plus the
	# attention-weighted sum of the encodings (the "glimpse")
	weights = tf.nn.dropout(weights, keep_prob)
	inputs = tf.nn.dropout(inputs, keep_prob)
	attention = tf.transpose(tf.matmul(weights, tf.transpose(inputs)) + bias)
	attention = tf.batch_matmul(encodings, tf.expand_dims(attention, -1))
	attention = tf.nn.softmax(tf.squeeze(attention, -1))
	return attention, tf.reduce_sum(tf.expand_dims(attention, -1) * encodings, 1)
 
def neural_attention(embedding_dim=384, encoding_dim=128):
	embeddings = tf.Variable(tf.random_normal([vocab_size, embedding_dim], stddev=0.22), dtype=tf.float32)
	tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(1e-4), [embeddings])
 
	with tf.variable_scope('encode'):
		with tf.variable_scope('X'):
			X_lens = tf.reduce_sum(tf.sign(tf.abs(X)), 1)
			embedded_X = tf.nn.embedding_lookup(embeddings, X)
			encoded_X = tf.nn.dropout(embedded_X, keep_prob)
			gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
			outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, encoded_X, sequence_length=X_lens, dtype=tf.float32, swap_memory=True)
			encoded_X = tf.concat(2, outputs)
		with tf.variable_scope('Q'):
			Q_lens = tf.reduce_sum(tf.sign(tf.abs(Q)), 1)
			embedded_Q = tf.nn.embedding_lookup(embeddings, Q)
			encoded_Q = tf.nn.dropout(embedded_Q, keep_prob)
			gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
			outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, encoded_Q, sequence_length=Q_lens, dtype=tf.float32, swap_memory=True)
			encoded_Q = tf.concat(2, outputs)
 
	W_q = tf.Variable(tf.random_normal([2*encoding_dim, 4*encoding_dim], stddev=0.22), dtype=tf.float32)
	b_q = tf.Variable(tf.random_normal([2*encoding_dim, 1], stddev=0.22), dtype=tf.float32)
	W_d = tf.Variable(tf.random_normal([2*encoding_dim, 6*encoding_dim], stddev=0.22), dtype=tf.float32)
	b_d = tf.Variable(tf.random_normal([2*encoding_dim, 1], stddev=0.22), dtype=tf.float32)
	g_q = tf.Variable(tf.random_normal([10*encoding_dim, 2*encoding_dim], stddev=0.22), dtype=tf.float32)
	g_d = tf.Variable(tf.random_normal([10*encoding_dim, 2*encoding_dim], stddev=0.22), dtype=tf.float32)
 
	with tf.variable_scope('attend') as scope:
		infer_gru = tf.nn.rnn_cell.GRUCell(4*encoding_dim)
		infer_state = infer_gru.zero_state(batch_size, tf.float32)
		for iter_step in range(8):
			if iter_step > 0:
				scope.reuse_variables()
 
			_, q_glimpse = glimpse(W_q, b_q, encoded_Q, infer_state)
			d_attention, d_glimpse = glimpse(W_d, b_d, encoded_X, tf.concat_v2([infer_state, q_glimpse], 1))
 
			gate_concat = tf.concat_v2([infer_state, q_glimpse, d_glimpse, q_glimpse * d_glimpse], 1)
 
			r_d = tf.sigmoid(tf.matmul(gate_concat, g_d))
			r_d = tf.nn.dropout(r_d, keep_prob)
			r_q = tf.sigmoid(tf.matmul(gate_concat, g_q))
			r_q = tf.nn.dropout(r_q, keep_prob)
 
			combined_gated_glimpse = tf.concat_v2([r_q * q_glimpse, r_d * d_glimpse], 1)
			_, infer_state = infer_gru(combined_gated_glimpse, infer_state)
 
	# zero out attention that falls on padding positions
	return tf.to_float(tf.sign(tf.abs(X))) * d_attention
 
def train_neural_attention():
	X_attentions = neural_attention()
	# maximize the attention mass that lands on positions where the passage token equals the answer
	loss = -tf.reduce_mean(tf.log(tf.reduce_sum(tf.to_float(tf.equal(tf.expand_dims(A, -1), X)) * X_attentions, 1) + tf.constant(0.00001)))
 
	optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
	grads_and_vars = optimizer.compute_gradients(loss)
	# clip each gradient to norm 5 to stabilize training
	capped_grads_and_vars = [(tf.clip_by_norm(g, 5), v) for g,v in grads_and_vars]
	train_op = optimizer.apply_gradients(capped_grads_and_vars)
 
	saver = tf.train.Saver()
	with tf.Session() as sess:
		sess.run(tf.global_variables_initializer())
 
		# writer = tf.summary.FileWriter()
		# restore a previous training run, if a checkpoint exists
		ckpt = tf.train.get_checkpoint_state('.')
		if ckpt is not None:
			print(ckpt.model_checkpoint_path)
			saver.restore(sess, ckpt.model_checkpoint_path)
		else:
			print("no saved model found")
 
		for step in range(20000):
			train_x, train_q, train_a = get_next_batch()
			loss_, _ = sess.run([loss, train_op], feed_dict={X:train_x, Q:train_q, A:train_a, keep_prob:0.7})
			print(loss_)
 
			# every 1000 steps, save the model and evaluate accuracy
			if step % 1000 == 0:
				path = saver.save(sess, 'machine_reading.model', global_step=step)
				print(path)
 
				# the graph is built with a fixed batch_size, so evaluate on the first batch of the validation set only
				test_x, test_q, test_a = get_test_batch()
				test_x, test_q, test_a = np.array(test_x[:batch_size]), np.array(test_q[:batch_size]), np.array(test_a[:batch_size])
				attentions = sess.run(X_attentions, feed_dict={X:test_x, Q:test_q, keep_prob:1.})
				correct_count = 0
				for x in range(test_x.shape[0]):
					probs = defaultdict(int)
					for idx, word in enumerate(test_x[x,:]):
						probs[word] += attentions[x, idx]
					guess = max(probs, key=probs.get)
					if guess == test_a[x]:
						correct_count += 1
				print(correct_count / test_x.shape[0])
 
train_neural_attention()
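
To make the evaluation step concrete, here is a minimal sketch (made-up numbers, not real model output) of the pooling it performs: the attention weight of every occurrence of a word is summed, and the word with the largest total mass is taken as the predicted answer.

from collections import defaultdict
 
# Illustrative only: token ids of one passage and their attention weights.
tokens = [7, 3, 7, 9]               # token id 7 occurs twice
attention = [0.1, 0.2, 0.4, 0.3]
 
probs = defaultdict(float)
for idx, word in enumerate(tokens):
    probs[word] += attention[idx]   # pool attention mass per word
 
print(max(probs, key=probs.get))    # -> 7, since 0.1 + 0.4 = 0.5 is the largest total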

All I can say is: this thing is a better reader than I am!
