# [Python人工智能] 五.theano实现神经网络正规化Regularization处理

## 二. 定义Layer类及增加数据集

1.定义Layer类

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)`

2.增加数据集

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)`

```
[[0.00000000e+00 1.80000000e-01 6.78152493e-02 ... 2.87234043e-01
  1.00000000e+00 8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  1.00000000e+00 2.04470199e-01]
 [2.35697744e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  9.89737254e-01 6.34657837e-02]
 ...
 [6.11892474e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.07891832e-01]
 [1.16072990e-03 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  9.91300620e-01 1.31070640e-01]
 [4.61841693e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.69701987e-01]]
[[24. ]
 [21.6]
 [34.7]
 [33.4]
 [36.2]
 ...
 [16.8]
 [22.4]
 [20.6]
 [23.9]
 [22. ]
 [11.9]]
(400, 13) (400, 1)
(106, 13) (106, 1)
```

1.定义变量和Layer

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)#定义x和yx = T.dmatrix("x")y = T.dmatrix("y")#定义两个Layer#L1: 13个属性，神经层有50个神经元，激活函数用tanhL1 = Layer(x, 13, 50, T.tanh)#L2: 输入为L1输出，输入个数为50，输出为1即房价L2 = Layer(L1.outputs, 50, 1, None)`

2.计算误差

（1）普通方法 定义cost变量计算误差，即预测值与真实值的差别。常用的方法如下，通过计算输出结果（预测值）和真实结果误差的平方平均值实现。 cost = T.mean(T.square(L2.outputs-y)) 但是该方法会产生Overfitting问题。为了解决Overfitting，在计算cost时需要额外加上一个惩罚项。

（2）L2 Regularization cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum()) 它是0.1乘以L1的权重平方求和加上L2的权重平方和，注意尽量用一个小于1的值来乘，如这里的0.1。 上面这个就是L2 Regularization方法，相当于有一个 0.1乘以所有的weight平方和，它称为惩罚机制。快要进入Overfitting时，通过这个机制来惩罚，不进入Overfitting，另一种方法是L1 Regularization。

（3）L1 Regularization cost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() + abs(L2.W).sum()) 根据流行程度来看，L2比L1更普及，这篇文章也主要使用L2进行实验，0.1可以取不同值，去分别测试对比实验。

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)#定义x和yx = T.dmatrix("x")y = T.dmatrix("y")#定义两个Layer#L1: 13个属性，神经层有50个神经元，激活函数用tanhL1 = Layer(x, 13, 50, T.tanh)#L2: 输入为L1输出，输入个数为50，输出为1即房价L2 = Layer(L1.outputs, 50, 1, None)#the way to compute cost#计算误差 但该方法的结果会产生Overfitting问题cost = T.mean(T.square(L2.outputs-y))#L2 regularization#0.1乘以L1的权重平方求和加上L2的权重平方和#惩罚机制: 快要进入Overfitting时，通过这个机制来惩罚不进入Overfittingcost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())#L1 regularizationcost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() 
+ abs(L2.W).sum())`

3.梯度下降更新

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] #print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)#定义x和yx = T.dmatrix("x")y = T.dmatrix("y")#定义两个Layer#L1: 13个属性，神经层有50个神经元，激活函数用tanhL1 = Layer(x, 13, 50, T.tanh)#L2: 输入为L1输出，输入个数为50，输出为1即房价L2 = Layer(L1.outputs, 50, 1, None)#the way to compute cost#计算误差 但该方法的结果会产生Overfitting问题cost = T.mean(T.square(L2.outputs-y))#L2 regularization#0.1乘以L1的权重平方求和加上L2的权重平方和#惩罚机制: 快要进入Overfitting时，通过这个机制来惩罚不进入Overfittingcost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())#L1 regularizationcost = T.mean(T.square(L2.outputs-y)) + 
0.1*(abs(L1.W).sum() + abs(L2.W).sum())#对比正规化和没有正规化的区别#梯度下降定义gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])#学习率learning_rate = 0.01#训练 updatestrain = theano.function(    inputs=[x,y],    updates=[(L1.W, L1.W - learning_rate * gW1),             (L1.b, L1.b - learning_rate * gb1),             (L2.W, L2.W - learning_rate * gW2),             (L2.b, L2.b - learning_rate * gb2)])#计算误差compute_cost = theano.function(inputs=[x,y], outputs=cost)print(compute_cost)`

4.预测结果

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] #print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)#定义x和yx = T.dmatrix("x")y = T.dmatrix("y")#定义两个Layer#L1: 13个属性，神经层有50个神经元，激活函数用tanhL1 = Layer(x, 13, 50, T.tanh)#L2: 输入为L1输出，输入个数为50，输出为1即房价L2 = Layer(L1.outputs, 50, 1, None)#the way to compute cost#计算误差 但该方法的结果会产生Overfitting问题cost = T.mean(T.square(L2.outputs-y))#L2 regularization#0.1乘以L1的权重平方求和加上L2的权重平方和#惩罚机制: 快要进入Overfitting时，通过这个机制来惩罚不进入Overfittingcost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())#L1 regularizationcost = T.mean(T.square(L2.outputs-y)) + 
0.1*(abs(L1.W).sum() + abs(L2.W).sum())#对比正规化和没有正规化的区别#梯度下降定义gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])#学习率learning_rate = 0.01#训练 updatestrain = theano.function(    inputs=[x,y],    updates=[(L1.W, L1.W - learning_rate * gW1),             (L1.b, L1.b - learning_rate * gb1),             (L2.W, L2.W - learning_rate * gW2),             (L2.b, L2.b - learning_rate * gb2)])#计算误差compute_cost = theano.function(inputs=[x,y], outputs=cost)print(compute_cost)#存储cost误差train_err_list =[]test_err_list = []learning_time = [] #计算每一步的i#训练1000次 每隔10次输出for i in range(1000):    train(x_train, y_train)    if i % 10 == 0:        #训练误差        cost1 = compute_cost(x_train, y_train)        train_err_list.append(cost1)        #预测误差        cost2 = compute_cost(x_test, y_test)        test_err_list.append(cost2)         learning_time.append(i) #对应i        print(cost1)        print(cost2)        print(i)`

```
76.95290841879309
64.23189302430346
0
50.777745719854
32.325523689775714
10
37.604371357212884
20.74023271455164
20
...
```

5.绘制图形对比

`#coding:utf-8import numpy as npimport theano.tensor as Timport theanofrom theano import functionfrom sklearn.datasets import load_bostonimport matplotlib.pyplot as plt#首先定义神经网络Layer类class Layer(object):    def __init__(self, inputs, in_size, out_size, activation_function=None):        #权重: 平均值为0 方差为1 行数为in_size  列数为out_size        self.W = theano.shared(np.random.normal(0,1,(in_size,out_size)))        #bias        self.b = theano.shared(np.zeros((out_size,) ) + 0.1)        #乘法加bias        self.Wx_plus_b = T.dot(inputs, self.W) + self.b #dot乘法        #激励函数        self.activation_function = activation_function        #默认为None,否则进行激活        if activation_function is None:             self.outputs = self.Wx_plus_b        else:             self.outputs = self.activation_function(self.Wx_plus_b)#正常化处理 数据降为0-1之间def minmax_normalization(data):    xs_max = np.max(data, axis=0)    xs_min = np.min(data, axis=0)    xs = (1-0) * (data - xs_min) / (xs_max - xs_min) + 0    return xs #导入sklearn中的波士顿房价数据集#500多个数据点 每个sample有13个特征去描述房价np.random.seed(100)x_data = load_boston().data #数据集#minmax normalization, rescale the inputsx_data = minmax_normalization(x_data)print(x_data)#增加一个维度 定义成矩阵的形式y_data = load_boston().target[:, np.newaxis] #print(y_data)#cross validation, train test data split#划分训练集和测试集#前400个sameple或样本行作为训练集, 剩余的作为预测集x_train, y_train = x_data[:400], y_data[:400]x_test, y_test = x_data[400:], y_data[400:]print(x_train.shape, y_train.shape)print(x_test.shape, y_test.shape)#定义x和yx = T.dmatrix("x")y = T.dmatrix("y")#定义两个Layer#L1: 13个属性，神经层有50个神经元，激活函数用tanhL1 = Layer(x, 13, 50, T.tanh)#L2: 输入为L1输出，输入个数为50，输出为1即房价L2 = Layer(L1.outputs, 50, 1, None)#the way to compute cost#计算误差 但该方法的结果会产生Overfitting问题cost = T.mean(T.square(L2.outputs-y))#L2 regularization#0.1乘以L1的权重平方求和加上L2的权重平方和#惩罚机制: 快要进入Overfitting时，通过这个机制来惩罚不进入Overfitting#cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())#L1 regularization#cost = T.mean(T.square(L2.outputs-y)) + 
0.1*(abs(L1.W).sum() + abs(L2.W).sum())#对比正规化和没有正规化的区别#梯度下降定义gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])#学习率learning_rate = 0.01#训练 updatestrain = theano.function(    inputs=[x,y],    updates=[(L1.W, L1.W - learning_rate * gW1),             (L1.b, L1.b - learning_rate * gb1),             (L2.W, L2.W - learning_rate * gW2),             (L2.b, L2.b - learning_rate * gb2)])#计算误差compute_cost = theano.function(inputs=[x,y], outputs=cost)print(compute_cost)#存储cost误差train_err_list =[]test_err_list = []learning_time = [] #计算每一步的i#训练1000次 每隔10次输出for i in range(1000):    train(x_train, y_train)    if i % 10 == 0:        #训练误差        cost1 = compute_cost(x_train, y_train)        train_err_list.append(cost1)        #预测误差        cost2 = compute_cost(x_test, y_test)        test_err_list.append(cost2)         learning_time.append(i) #对应i        print(cost1)        print(cost2)        print(i)#plot cost historyplt.plot(learning_time, train_err_list, 'r-') #红色线为训练误差plt.plot(learning_time, test_err_list, 'b--') #蓝色虚线为测试结果plt.show()`

（1）Overfitting问题对应曲线，红色线为训练误差，蓝色虚线为测试结果，会发现预测的误差在不断变大。

cost = T.mean(T.square(L2.outputs-y))

（2）L2 Regularization，通过正规化处理后的结果，发现预测结果和训练结果的误差变化基本一致，其效果更好。 cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())

（3）L1 regularization输出结果如下图所示： cost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() + abs(L2.W).sum())

451 篇文章44 人订阅

0 条评论