# 数据载入过慢？这里有一份TensorFlow加速指南

##### 王小新 编译自 Towards Data Science 量子位 出品 | 公众号 QbitAI

教程配套代码（Jupyter Notebook）：https://github.com/FrancescoSaverioZuppichini/Tensorflow-Dataset-Tutorial/blob/master/dataset_tutorial.ipynb

# 概述

1. 导入数据，从某些数据创建一个数据集实例；

2. 创建迭代器iterator，即使用已有的数据集来创建一个迭代器实例，对数据集进行迭代；

3. 消耗数据，即使用所创建的迭代器，从数据集中取出元素输入到模型。

# 导入数据

## 使用Numpy

```# create a random vector of shape (100,2)
x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)```

```features, labels = (np.random.sample((100,2)), np.random.sample((100,1)))
dataset = tf.data.Dataset.from_tensor_slices((features,labels))```

## 使用Tensors

```# using a tensor
dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))```

## 使用Placeholder

```x = tf.placeholder(tf.float32, shape=[None,2])
dataset = tf.data.Dataset.from_tensor_slices(x)```

## 使用generator

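```python
# elements have different lengths, so keep them in a plain Python list
sequence = [[1], [2, 3], [3, 4]]

def generator():
    for el in sequence:
        yield el

# from_generator is a static factory: call it on the class, not on an instance.
# output_shapes takes a shape (here: 1-D, unknown length), not a dtype.
dataset = tf.data.Dataset.from_generator(generator,
                                         output_types=tf.float32,
                                         output_shapes=tf.TensorShape([None]))
```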

# 创建迭代器

## One shot迭代器

```x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
# create the iterator
iter = dataset.make_one_shot_iterator()```

```...
# create the iterator
iter = dataset.make_one_shot_iterator()
el = iter.get_next()```

```python
with tf.Session() as sess:
    print(sess.run(el)) # output: [ 0.42116176  0.40666069]
```

## 可初始化迭代器

```python
# using a placeholder
x = tf.placeholder(tf.float32, shape=[None,2])
dataset = tf.data.Dataset.from_tensor_slices(x)

data = np.random.sample((100,2))

iter = dataset.make_initializable_iterator() # create the iterator
el = iter.get_next()
with tf.Session() as sess:
    # feed the placeholder with data
    sess.run(iter.initializer, feed_dict={ x: data })
    print(sess.run(el)) # output [ 0.52374458  0.71968478]
```

```train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.array([[1,2]]), np.array([[0]]))```

```python
# initializable iterator to switch between dataset
EPOCHS = 10

x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])
dataset = tf.data.Dataset.from_tensor_slices((x, y))

train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.array([[1,2]]), np.array([[0]]))

iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()

with tf.Session() as sess:
    # initialise iterator with train data
    sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})
    for _ in range(EPOCHS):
        sess.run([features, labels])
    # switch to test data
    sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1]})
    print(sess.run([features, labels]))
```

## 可重初始化迭代器

```# making fake data using numpy
train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.random.sample((10,2)), np.random.sample((10,1)))```

```# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)```

```# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
train_dataset.output_shapes)```

```# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)```

`features, labels = iter.get_next()`

```python
# Reinitializable iterator to switch between Datasets
EPOCHS = 10
# making fake data using numpy
train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.random.sample((10,2)), np.random.sample((10,1)))
# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
with tf.Session() as sess:
    sess.run(train_init_op) # switch to train dataset
    for _ in range(EPOCHS):
        sess.run([features, labels])
    sess.run(test_init_op) # switch to val dataset
    print(sess.run([features, labels]))
```

# 消耗数据

```...
next_el = iter.get_next()
...
print(sess.run(next_el)) # will output the current element```

```# using two numpy arrays
features, labels = (np.array([np.random.sample((100,2))]),
np.array([np.random.sample((100,1))]))
dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)```

```iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()```

```python
# make a simple model
net = tf.layers.dense(x, 8) # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8)
prediction = tf.layers.dense(net, 1)
loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_next() as label
```

```python
EPOCHS = 10
BATCH_SIZE = 16
# using two numpy arrays
features, labels = (np.array([np.random.sample((100,2))]),
                    np.array([np.random.sample((100,1))]))

dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)

iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()

# make a simple model
net = tf.layers.dense(x, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)

loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_next() as label
train_op = tf.train.AdamOptimizer().minimize(loss) # the op that sess.run executes below to update the weights
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(EPOCHS):
        _, loss_value = sess.run([train_op, loss])
        print("Iter: {}, Loss: {:.4f}".format(i, loss_value))
```

```Iter: 0, Loss: 0.1328
Iter: 1, Loss: 0.1312
Iter: 2, Loss: 0.1296
Iter: 3, Loss: 0.1281
Iter: 4, Loss: 0.1267
Iter: 5, Loss: 0.1254
Iter: 6, Loss: 0.1242
Iter: 7, Loss: 0.1231
Iter: 8, Loss: 0.1220
Iter: 9, Loss: 0.1210```

# 更多内容

## 批处理

```python
# BATCHING
BATCH_SIZE = 4
x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))
```

```[[ 0.65686128  0.99373963]
[ 0.69690451  0.32446826]
[ 0.57148422  0.68688242]
[ 0.20335116  0.82473219]]```

## Shuffle操作

```python
# SHUFFLE
BATCH_SIZE = 4
x = np.array([[1],[2],[3],[4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))
```

```[[4]
[2]
[3]
[1]]```

```[[3]
[1]
[2]
[4]]```

## Map操作

```python
# MAP
x = np.array([[1],[2],[3],[4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.map(lambda x: x*2)

iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    # run once per element; one more sess.run(el) would raise tf.errors.OutOfRangeError
    for _ in range(len(x)):
        print(sess.run(el))
```

```[2]
[4]
[6]
[8]```

# 相关链接

TensorFlow dataset官方教程： https://www.tensorflow.org/programmers_guide/datasets

Dataset的API文档： https://www.tensorflow.org/api_docs/python/tf/data/Dataset

3387 篇文章97 人订阅

0 条评论

5363

4795

4478

1.5K9

871

652

1774

2996