# Python编程任务 | 斯坦福CS231n-深度学习与计算机视觉课程

Assignment 3

04 Python编程任务（2-layer神经网络）

· Assignment1的神经网络部分，我们需要完成neural_net.py，完成后可以用two_layer_net.ipynb里的代码（部分代码需要自己完成）来调试你的模型，优化超参数，获得最优模型，最后在测试集上测试分类水平。

· 这里用的图像库还是CIFAR-10。

neural_net.py 代码如下：

```python
__coauthor__ = 'Deeplayer'
# 6.14.2016

import numpy as np
class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network.

    The net has an input dimension of D, a hidden layer dimension of H,
    and performs classification over C classes. Architecture:

        input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the class scores.
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """
        Initialize weights to small random values and biases to zero.

        Inputs:
        - input_size: D, dimension of the input.
        - hidden_size: H, number of hidden units.
        - output_size: C, number of classes.
        - std: scale of the Gaussian weight initialization.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros((1, hidden_size))
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros((1, output_size))

    def loss(self, X, y=None, reg=0.0):
        """
        Compute the loss and gradients for the two-layer network.

        Inputs:
        - X: array of shape (N, D), a minibatch of data.
        - y: array of shape (N,) of integer labels, or None.
        - reg: L2 regularization strength.

        Returns:
        - If y is None: the class scores, shape (N, C).
        - Otherwise: a tuple (loss, grads) where grads maps each parameter
          name in self.params to the gradient of the loss w.r.t. it.
          (BUG FIX: the original computed the gradients but never built the
          grads dict nor returned anything, so train() crashed unpacking it.)
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N, D = X.shape

        # Forward pass (ReLU inlined so the class is self-contained).
        h1 = np.maximum(0, np.dot(X, W1) + b1)   # hidden layer  (N, H)
        scores = np.dot(h1, W2) + b2             # class scores  (N, C)
        if y is None:
            return scores

        # Softmax with max-subtraction for numeric stability.
        scores_max = np.max(scores, axis=1, keepdims=True)            # (N, 1)
        exp_scores = np.exp(scores - scores_max)                      # (N, C)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Cross-entropy data loss plus L2 regularization.
        correct_logprobs = -np.log(probs[np.arange(N), y])            # (N,)
        data_loss = np.sum(correct_logprobs) / N
        reg_loss = 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        loss = data_loss + reg_loss

        # Backward pass: gradient of the softmax cross-entropy w.r.t. scores.
        dscores = probs.copy()                   # (N, C)
        dscores[np.arange(N), y] -= 1
        dscores /= N

        grads = {}
        # Backprop into W2 and b2.
        grads['W2'] = np.dot(h1.T, dscores) + reg * W2                 # (H, C)
        grads['b2'] = np.sum(dscores, axis=0, keepdims=True)           # (1, C)
        # Backprop through the hidden layer and the ReLU non-linearity.
        dh1 = np.dot(dscores, W2.T)                                    # (N, H)
        dh1[h1 <= 0] = 0
        # Backprop into W1 and b1.
        grads['W1'] = np.dot(X.T, dh1) + reg * W1                      # (D, H)
        grads['b1'] = np.sum(dh1, axis=0, keepdims=True)               # (1, H)

        return loss, grads

    def train(self, X, y, X_val, y_val, learning_rate=1e-3,
              learning_rate_decay=0.95, reg=1e-5, mu=0.9, num_epochs=10,
              mu_increase=1.0, batch_size=200, verbose=False):
        """
        Train this neural network using SGD with momentum.

        Inputs:
        - X: array of shape (N, D) of training data.
        - y: array of shape (N,) of training labels; y[i] = c, 0 <= c < C.
        - X_val: array of shape (N_val, D) of validation data.
        - y_val: array of shape (N_val,) of validation labels.
        - learning_rate: scalar learning rate.
        - learning_rate_decay: per-epoch multiplicative decay of the rate.
        - reg: L2 regularization strength.
        - mu: momentum coefficient; mu_increase: per-epoch factor applied to mu.
        - num_epochs: number of passes (in expectation) over the data.
        - batch_size: examples per SGD step.
        - verbose: if True, print progress every epoch.

        Returns a dict with 'loss_history', 'train_acc_history',
        'val_acc_history'. Accuracy histories are recorded every epoch
        regardless of `verbose` (BUG FIX: the original only recorded them —
        and only decayed the learning rate — when verbose was True).
        """
        num_train = X.shape[0]
        # Integer division: this value is a loop bound and a modulus.
        iterations_per_epoch = max(num_train // batch_size, 1)

        # Momentum velocities, one per parameter.
        v_W2, v_b2 = 0.0, 0.0
        v_W1, v_b1 = 0.0, 0.0
        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(1, num_epochs * iterations_per_epoch + 1):
            # Sampling with replacement is faster than sampling without.
            sample_index = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[sample_index, :]         # (batch_size, D)
            y_batch = y[sample_index]            # (batch_size,)

            # Loss and gradients on the current minibatch.
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)

            # Parameter update with momentum.
            v_W2 = mu * v_W2 - learning_rate * grads['W2']
            self.params['W2'] += v_W2
            v_b2 = mu * v_b2 - learning_rate * grads['b2']
            self.params['b2'] += v_b2
            v_W1 = mu * v_W1 - learning_rate * grads['W1']
            self.params['W1'] += v_W1
            v_b1 = mu * v_b1 - learning_rate * grads['b1']
            self.params['b1'] += v_b1

            # Every epoch: record accuracies, decay the learning rate,
            # and grow the momentum coefficient.
            if it % iterations_per_epoch == 0:
                epoch = it // iterations_per_epoch
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)
                if verbose:
                    print('epoch %d / %d: loss %f, train_acc: %f, val_acc: %f' %
                          (epoch, num_epochs, loss, train_acc, val_acc))
                learning_rate *= learning_rate_decay
                mu *= mu_increase

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def predict(self, X):
        """
        Predict labels for the data points in X.

        Inputs:
        - X: array of shape (N, D) of data points to classify.

        Returns:
        - y_pred: array of shape (N,); y_pred[i] = c means X[i] is
          predicted to have class c, where 0 <= c < C.
        """
        h1 = np.maximum(0, np.dot(X, self.params['W1']) + self.params['b1'])
        scores = np.dot(h1, self.params['W2']) + self.params['b2']
        y_pred = np.argmax(scores, axis=1)

        return y_pred

def ReLU(x):
    """Elementwise rectified linear unit: returns max(x, 0)."""
    # Clip from below at zero; equivalent to np.maximum(0, x).
    return np.clip(x, 0, None)

nn_twolayer_best.py 代码如下：

```python
__coauthor__ = 'Deeplayer'
# 6.16.2016

import numpy as np
import matplotlib.pyplot as plt
from neural_net import TwoLayerNet
from data_utils import load_CIFAR10
from vis_utils import visualize_grid

# Load the data
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the two-layer neural net classifier. These are the same steps as
    we used for the SVM, but condensed to a single function.

    Returns (X_train, y_train, X_val, y_val, X_test, y_test) with the data
    mean-centered and flattened to rows.
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'E:/PycharmProjects/ML/CS231n/cifar-10-batches-py'   # make a change
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data.
    # BUG FIX: the original reused the validation mask
    # (num_training..num_training+num_validation) to slice the training AND
    # test sets, which yields 1000 mislabeled "training" rows and indexes
    # past the end of the 10000-row test set.
    X_val = X_train[num_training:num_training + num_validation]   # (1000,32,32,3)
    y_val = y_train[num_training:num_training + num_validation]   # (1000,)
    X_train = X_train[:num_training]   # (49000,32,32,3)
    y_train = y_train[:num_training]   # (49000,)
    X_test = X_test[:num_test]         # (1000,32,32,3)
    y_test = y_test[:num_test]         # (1000,)

    # Preprocessing: subtract the mean image (computed on training data only).
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)    # (49000,3072)
    X_val = X_val.reshape(num_validation, -1)      # (1000,3072)
    X_test = X_test.reshape(num_test, -1)          # (1000,3072)

    return X_train, y_train, X_val, y_val, X_test, y_test

# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
# Python 3 print calls (the original Py2 print statements are syntax errors
# under Python 3).
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

# Look for the best net
best_net = None      # store the best model into this
input_size = 32 * 32 * 3
hidden_size = 100
num_classes = 10
net = TwoLayerNet(input_size, hidden_size, num_classes)

"""
max_count = 100
for count in xrange(1, max_count + 1):
reg = 10 ** np.random.uniform(-4, 1)
lr = 10 ** np.random.uniform(-5, -3)
stats = net.train(X_train, y_train, X_val, y_val, num_epochs=5,
batch_size=200, mu=0.5, mu_increase=1.0, learning_rate=lr,
learning_rate_decay=0.95, reg=reg, verbose=True)

print 'val_acc: %f, lr: %s, reg: %s, (%d / %d)' %
(stats['val_acc_history'][-1], format(lr, 'e'), format(reg, 'e'), count, max_count)

# according to the above experiment, reg ~= 0.9,  lr ~= 5e-4
"""

stats = net.train(X_train, y_train, X_val, y_val,
num_epochs=40, batch_size=400, mu=0.5,
mu_increase=1.0, learning_rate=5e-4,
learning_rate_decay=0.95, reg=0.9, verbose=True)

# Predict on the validation set
val_acc = (net.predict(X_val) == y_val).mean()
print 'Validation accuracy: ', val_acc    # about 52.7%

# Plot the loss function and train / validation accuracies
plt.subplot(2, 1, 1)
plt.plot(stats['loss_history'])
plt.title('Loss history')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.subplot(2, 1, 2)
plt.plot(stats['train_acc_history'], label='train')
plt.plot(stats['val_acc_history'], label='val')
plt.ylim([0, 0.8])
plt.title('Classification accuracy history')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(bbox_to_anchor=(1.0, 0.4))
plt.grid(True)
plt.show()

best_net = net
# Run on the test set (Python 3 print; the original Py2 statement is a
# syntax error under Python 3).
test_acc = (best_net.predict(X_test) == y_test).mean()
print('Test accuracy: ', test_acc)    # about 54.6%

# Visualize the weights of the best network
def show_net_weights(net):
    """Visualize the first-layer weights as a grid of 32x32x3 images."""
    W1 = net.params['W1']
    # (3072, H) -> (H, 32, 32, 3): one image per hidden unit.
    W1 = W1.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
    # NOTE(review): the mangled source computed W1 but never rendered it;
    # this imshow line restores the standard CS231n visualization using the
    # visualize_grid helper imported at the top of the file — confirm its
    # signature matches.
    plt.imshow(visualize_grid(W1, padding=3).astype('uint8'))
    plt.gca().axis('off')
    plt.show()

show_net_weights(best_net)

loss.png

W1.png

1、2006年，Hinton发表的两篇论文Reducing the Dimensionality of Data with Neural Networks、A Fast Learning Algorithm for Deep Belief Nets利用预训练方法缓解了局部最优解问题，具体思想就是：利用无监督的逐层贪婪学习算法，一层一层地预训练神经网络的权重（每一层通过一个稀疏自编码器完成训练），最后再用有标签的数据通过反向传播微调所有权重。

2、我们之前讲过的ReLU、Maxout等激活函数，可以很好地克服“梯度消失”现象，而后来的Batch Normalization更是凶猛。

3、2012年ImageNet比赛中，CNN以压倒性的优势取得胜利，深度学习的巨大浪潮才正式开始。而自那之后，预训练方法已经被完全抛弃了，大概是因为数据量足够大了。

436 篇文章93 人订阅

0 条评论

## 相关文章

13510

41710

39260

14910

37670

### Deep learning基于theano的keras学习笔记（3）-网络层

1.3 Dropout层 为输入数据施加Dropout。Dropout将在训练过程中每次更新参数时随机断开一定百分比（p）的输入神经元连接，Dropout层...

18420

356110

435150

### 机器学习决策树：提炼出分类器算法

? 前面三天推送了决策树的基本原理和选择最佳分裂特征的几种公式，用到决策树一般都会出现过拟合问题，因此需要对决策树进行剪枝，阐述了常用的几种剪枝的方法（这些方...

35580

### scikit-learn 梯度提升树(GBDT)调参小结

在梯度提升树(GBDT)原理小结中，我们对GBDT的原理做了总结，本文我们就从scikit-learn里GBDT的类库使用方法作一个总结，主要会关注调参...

80630