# 优化算法:到底是数学还是代码？

• 随机梯度下降算法(SGD)
• Momentum算法
• RMSProp算法

θ（theta）是你想要找到使J最小化的最优值。J在这里被称为目标函数。最后我们得到了一个被称为α（alpha）的学习速率。反复评估这个函数，直到你达到预期的成本。

θ：作为山的位置

dJ(θ)：作为位置θ处山坡的坡度

α：作为每一步沿坡向下移动的步长（学习速率）

import numpy as np

def minimaFunction(theta):
    """Objective function J(theta) = cos(3*pi*theta) / theta."""
    # Original snippet lost its indentation (SyntaxError); restored here.
    return np.cos(3 * np.pi * theta) / theta

def minimaFunctionDerivative(theta):
    """Analytic derivative dJ/dtheta of J(theta) = cos(3*pi*theta)/theta."""
    # Original snippet lost its indentation (SyntaxError); restored here.
    const1 = 3 * np.pi
    const2 = const1 * theta
    # Quotient rule: d/dtheta [cos(c*theta)/theta]
    #   = -c*sin(c*theta)/theta - cos(c*theta)/theta^2
    return -(const1 * np.sin(const2) / theta) - np.cos(const2) / theta**2

import matplotlib.pyplot as plt  # needed here: plt was used before the later import

# Sample the 1-D objective and its derivative over (0.1, 2.1).
theta = np.arange(.1, 2.1, .01)
Jtheta = minimaFunction(theta)
dJtheta = minimaFunctionDerivative(theta)

# The derivative is divided by 30 only so both curves fit on one scale.
plt.plot(theta, Jtheta, label=r'$J(\theta)$')
plt.plot(theta, dJtheta / 30, label=r'$dJ(\theta)/30$')
plt.legend()
axes = plt.gca()
# axes.set_ylim([-10,10])

plt.ylabel(r'$J(\theta),dJ(\theta)/30$')
plt.xlabel(r'$\theta$')
plt.title(r'$J(\theta),dJ(\theta)/30$ vs $\theta$')
plt.show()

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

def optimize(iterations, oF, dOF, params, learningRate):
    """
    Computes the optimal value of params for a given objective function
    and its derivative, via plain gradient descent.

    Arguments:
    - iterations - the number of gradient-descent steps to take
    - oF - the objective function (kept for interface symmetry; not
      evaluated by this optimizer)
    - dOF - the derivative function of the objective function
    - params - the initial (scalar) parameter value
    - learningRate - the learning rate (step size alpha)
    Return:
    - oParams - numpy array of the parameter value at each step of
      iteration, including the starting value
    """
    oParams = [params]
    # The iteration loop (original had the fused token `for iin`,
    # a SyntaxError; restored here).
    for _ in range(iterations):
        # Compute the derivative at the current parameters
        dParams = dOF(params)
        # Vanilla SGD update: theta <- theta - alpha * dJ(theta)
        params = params - learningRate * dParams
        # Append the new parameters to the trajectory
        oParams.append(params)

    return np.array(oParams)

def minimaFunction(theta):
    """Objective function J(theta) = cos(3*pi*theta) / theta."""
    # Indentation restored (the scraped snippet was a SyntaxError).
    return np.cos(3 * np.pi * theta) / theta

def minimaFunctionDerivative(theta):
    """Analytic derivative dJ/dtheta of J(theta) = cos(3*pi*theta)/theta."""
    # Indentation restored (the scraped snippet was a SyntaxError).
    const1 = 3 * np.pi
    const2 = const1 * theta
    return -(const1 * np.sin(const2) / theta) - np.cos(const2) / theta**2

# Run plain gradient descent on the 1-D objective from a fixed start.
theta = .6
iterations = 45
learningRate = .0007
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               theta,
                               learningRate)

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    """Negated, scaled bivariate normal surface used as the 2-D objective.

    params: (X, Y) coordinates (scalars or numpy arrays).
    Returns -40 * pdf of an axis-aligned (zero-correlation) bivariate
    normal with sigmas (3.0, .5) and means (0.0, 0.0).
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # mlab.bivariate_normal was removed from matplotlib; compute the
    # zero-correlation bivariate normal pdf directly with numpy.
    Z1 = (np.exp(-0.5 * (((X - mu11) / sigma11)**2 + ((Y - mu12) / sigma12)**2))
          / (2 * np.pi * sigma11 * sigma12))

    return -40 * Z1

def minimaFunctionDerivative(params):
    """Per-axis gradient terms used for the bivariate normal objective.

    NOTE(review): each component uses only the marginal pdf of its own
    axis (the cross-axis pdf factor and the -40 scale are omitted),
    matching the article's simplified gradient — confirm if an exact
    gradient of minimaFunction is required.
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # d/dx of a normal pdf is pdf(x) * (mu - x) / sigma^2; negated here.
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2

    return (dZ1X, dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function
    and its derivative, via per-coordinate gradient descent (SGD).

    Arguments:
    - iterations - the number of gradient-descent steps to take
    - oF - the objective function (kept for interface symmetry; unused)
    - dOF - the gradient function; returns a tuple matching params
    - params - the initial parameter tuple (x, y)
    - learningRate - the learning rate (step size alpha)
    - beta - the weighted moving average parameter (unused by vanilla
      SGD; kept so all optimizers in this article share one signature)
    Return:
    - oParams - the list of parameter tuples at each step of iteration
    """
    oParams = [params]
    # The iteration loop (fused tokens `for iin`, `dParfor`, `parin`
    # made the original a SyntaxError; restored here).
    for _ in range(iterations):
        # Compute the gradient at the current parameters
        dParams = dOF(params)

        # SGD applied coordinate-wise: p <- p - alpha * dp
        params = tuple(par - learningRate * dPar
                       for dPar, par in zip(dParams, params))

        # Append the new parameters to the trajectory
        oParams.append(params)

    return oParams

# Plain SGD on the bivariate surface.
iterations = 100
learningRate = 1
beta = .9
x, y = 4.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

γ（gamma）和ν（nu）值允许用户对dJ(θ)的前一个值和当前值进行加权，以确定θ的新值。人们很普遍地选择γ和ν的值来创建一个指数的加权移动平均，如下所示:

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    """Negated, scaled bivariate normal surface used as the 2-D objective.

    params: (X, Y) coordinates (scalars or numpy arrays).
    Returns -40 * pdf of an axis-aligned (zero-correlation) bivariate
    normal with sigmas (3.0, .5) and means (0.0, 0.0).
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # mlab.bivariate_normal was removed from matplotlib; compute the
    # zero-correlation bivariate normal pdf directly with numpy.
    Z1 = (np.exp(-0.5 * (((X - mu11) / sigma11)**2 + ((Y - mu12) / sigma12)**2))
          / (2 * np.pi * sigma11 * sigma12))

    return -40 * Z1

def minimaFunctionDerivative(params):
    """Per-axis gradient terms used for the bivariate normal objective.

    NOTE(review): each component uses only the marginal pdf of its own
    axis (the cross-axis pdf factor and the -40 scale are omitted),
    matching the article's simplified gradient — confirm if an exact
    gradient of minimaFunction is required.
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # d/dx of a normal pdf is pdf(x) * (mu - x) / sigma^2; negated here.
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2

    return (dZ1X, dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function
    and its derivative, via gradient descent with momentum.

    Arguments:
    - iterations - the number of steps to take
    - oF - the objective function (kept for interface symmetry; unused)
    - dOF - the gradient function; returns a tuple matching params
    - params - the initial parameter tuple (x, y)
    - learningRate - the learning rate (step size alpha)
    - beta - the weighted moving average parameter for momentum
    Return:
    - oParams - the list of parameter tuples at each step of iteration
    """
    oParams = [params]
    vdw = (0.0, 0.0)  # momentum accumulator, one entry per parameter
    # The iteration loop (fused tokens in the original were a
    # SyntaxError; restored here).
    for _ in range(iterations):
        # Compute the gradient at the current parameters
        dParams = dOF(params)

        # Exponentially weighted moving average of the gradient:
        # vdw <- beta * vdw + (1 - beta) * dp
        # (the original comment said (1.0+beta), contradicting the code)
        vdw = tuple(v * beta + (1.0 - beta) * dPar
                    for dPar, v in zip(dParams, vdw))

        # Descend along the smoothed gradient: p <- p - alpha * vdw
        params = tuple(par - learningRate * v
                       for v, par in zip(vdw, params))

        # Append the new parameters to the trajectory
        oParams.append(params)

    return oParams

# SGD with momentum on the bivariate surface.
iterations = 100
learningRate = 5.3
beta = .9
x, y = 4.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

RMSProp算法 通过观察各个参数梯度的相对大小，RMSProp算法尝试对Momentum函数进行改进。正因为如此，我们可以采取每个梯度的平方的加权指数移动平均，并按比例将梯度下降函数标准化。带有大梯度的参数的更新会按其梯度规模做归一化，从而平滑地下降到最优值。这可以从下面的等式中看出:

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    """Negated, scaled bivariate normal surface used as the 2-D objective.

    params: (X, Y) coordinates (scalars or numpy arrays).
    Returns -40 * pdf of an axis-aligned (zero-correlation) bivariate
    normal with sigmas (3.0, .5) and means (0.0, 0.0).
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # mlab.bivariate_normal was removed from matplotlib; compute the
    # zero-correlation bivariate normal pdf directly with numpy.
    Z1 = (np.exp(-0.5 * (((X - mu11) / sigma11)**2 + ((Y - mu12) / sigma12)**2))
          / (2 * np.pi * sigma11 * sigma12))

    return -40 * Z1

def minimaFunctionDerivative(params):
    """Per-axis gradient terms used for the bivariate normal objective.

    NOTE(review): each component uses only the marginal pdf of its own
    axis (the cross-axis pdf factor and the -40 scale are omitted),
    matching the article's simplified gradient — confirm if an exact
    gradient of minimaFunction is required.
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # d/dx of a normal pdf is pdf(x) * (mu - x) / sigma^2; negated here.
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2

    return (dZ1X, dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta):
    """
    Computes the optimal value of params for a given objective function
    and its derivative, via RMSProp.

    Arguments:
    - iterations - the number of steps to take
    - oF - the objective function (kept for interface symmetry; unused)
    - dOF - the gradient function; returns a tuple matching params
    - params - the initial parameter tuple (x, y)
    - learningRate - the learning rate (step size alpha)
    - beta - the weighted moving average parameter for RMSProp
    Return:
    - oParams - the list of parameter tuples at each step of iteration
    """
    oParams = [params]
    sdw = (0.0, 0.0)  # moving average of squared gradients
    eps = 10**(-7)    # guards against division by zero
    # The iteration loop (fused tokens in the original were a
    # SyntaxError; restored here).
    for _ in range(iterations):
        # Compute the gradient at the current parameters
        dParams = dOF(params)

        # EWMA of the squared gradient: sdw <- beta*sdw + (1-beta)*dp^2
        # (the original comment said (1.0+beta), contradicting the code)
        sdw = tuple(s * beta + (1.0 - beta) * dPar**2
                    for dPar, s in zip(dParams, sdw))

        # Normalize each step by the RMS of recent gradients:
        # p <- p - alpha * dp / (sqrt(sdw) + eps)
        params = tuple(par - learningRate * dPar / (s**.5 + eps)
                       for s, par, dPar in zip(sdw, params, dParams))

        # Append the new parameters to the trajectory
        oParams.append(params)

    return oParams

# RMSProp on the bivariate surface.
iterations = 10
learningRate = .3
beta = .9
x, y = 5.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.animation as animation

def minimaFunction(params):
    """Negated, scaled bivariate normal surface used as the 2-D objective.

    params: (X, Y) coordinates (scalars or numpy arrays).
    Returns -40 * pdf of an axis-aligned (zero-correlation) bivariate
    normal with sigmas (3.0, .5) and means (0.0, 0.0).
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # mlab.bivariate_normal was removed from matplotlib; compute the
    # zero-correlation bivariate normal pdf directly with numpy.
    Z1 = (np.exp(-0.5 * (((X - mu11) / sigma11)**2 + ((Y - mu12) / sigma12)**2))
          / (2 * np.pi * sigma11 * sigma12))

    return -40 * Z1

def minimaFunctionDerivative(params):
    """Per-axis gradient terms used for the bivariate normal objective.

    NOTE(review): each component uses only the marginal pdf of its own
    axis (the cross-axis pdf factor and the -40 scale are omitted),
    matching the article's simplified gradient — confirm if an exact
    gradient of minimaFunction is required.
    """
    X, Y = params

    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)

    # d/dx of a normal pdf is pdf(x) * (mu - x) / sigma^2; negated here.
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2

    return (dZ1X, dZ1Y)

def optimize(iterations, oF, dOF, params, learningRate, beta1, beta2):
    """
    Computes the optimal value of params for a given objective function
    and its derivative, via ADAM (momentum + RMSProp with bias correction).

    Arguments:
    - iterations - the number of steps to take
    - oF - the objective function (kept for interface symmetry; unused)
    - dOF - the gradient function; returns a tuple matching params
    - params - the initial parameter tuple (x, y)
    - learningRate - the learning rate (step size alpha)
    - beta1 - the weighted moving average parameter for the momentum
      (first-moment) component of ADAM
    - beta2 - the weighted moving average parameter for the RMSProp
      (second-moment) component of ADAM
    Return:
    - oParams - the list of parameter tuples at each step of iteration
    """
    oParams = [params]
    vdw = (0.0, 0.0)  # first-moment (momentum) accumulator
    sdw = (0.0, 0.0)  # second-moment (RMS) accumulator
    eps = 10**(-7)    # guards against division by zero
    # The iteration loop (fused tokens in the original were a
    # SyntaxError; restored here).
    for i in range(iterations):
        # Compute the gradient at the current parameters
        dParams = dOF(params)

        # First moment: vdw <- beta1*vdw + (1-beta1)*dp
        vdw = tuple(v * beta1 + (1.0 - beta1) * dPar
                    for dPar, v in zip(dParams, vdw))

        # Second moment: sdw <- beta2*sdw + (1-beta2)*dp^2
        sdw = tuple(s * beta2 + (1.0 - beta2) * dPar**2.0
                    for dPar, s in zip(dParams, sdw))

        # Bias correction for the zero-initialized accumulators
        vdwCorr = tuple(v / (1.0 - beta1**(i + 1.0)) for v in vdw)
        sdwCorr = tuple(s / (1.0 - beta2**(i + 1.0)) for s in sdw)

        # ADAM step: p <- p - alpha * vCorr / (sqrt(sCorr) + eps).
        # BUG FIX: the original unpacked zip(vdwCorr, sdwCorr, params)
        # as (sdwCORR, vdwCORR, par), swapping the momentum and RMS
        # terms in the update; the order is corrected here.
        params = tuple(par - learningRate * vCorr / (sCorr**.5 + eps)
                       for vCorr, sCorr, par in zip(vdwCorr, sdwCorr, params))

        # Append the new parameters to the trajectory
        oParams.append(params)

    return oParams

# ADAM on the bivariate surface.
# (A stray `</div>` HTML fragment from the web scrape was removed.)
iterations = 100
learningRate = .1
beta1 = .9
beta2 = .999
x, y = 5.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta1,
                               beta2)

– 随机梯度下降算法：100次迭代 – 随机梯度下降算法+Momentum算法：50次迭代 – RMSProp算法：10次迭代 – Adam算法：5次迭代

1600 篇文章85 人订阅

0 条评论

## 相关文章

1754

3919

7866

2879

1133

### 开发 | 计算机视觉中，究竟有哪些好用的目标跟踪算法（下）

VOT2015竞赛 VOT2015 Challenge | Home（http://votchallenge.net/vot2015/） 如期而至，这一年有60...

4886

37111

### 斯坦福CS231n - CNN for Visual Recognition（8）-lecture6学习率更新、超参数调优

训练深度网络时，让学习率随着时间退火通常很有帮助。如果学习率很高，系统的动能就过大，参数向量就会无规律地跳动，不能够稳定到损失函数更深更窄的部分去。

922

1174

### 【干货】不止准确率：为分类任务选择正确的机器学习度量指标（附代码实现）

【导读】本文是数据科学研究者William Koehrsen撰写的技术博文，介绍了在分类模型中需要用到的度量标准。我们知道，准确率是我们在分类任务中最常用到的度...

3537