对于数据集非线性可分的情况,要使用SVM,必须先用核函数将数据从低维空间映射到高维空间,转化成易于分离器理解的形式。核函数并不仅仅应用于SVM,很多其它的机器学习算法也会用到核函数。
径向基函数是SVM中常用的一类核函数。径向基函数是一个采用向量作为自变量的函数,能够基于向量距离运算出一个标量。这个距离可以是从零向量或者其它向量开始计算的距离。本篇我们会用到径向基函数的高斯版本,其公式为:

$$k(\mathbf{x}, \mathbf{y}) = \exp\!\left( \frac{-\lVert \mathbf{x} - \mathbf{y} \rVert^{2}}{\sigma^{2}} \right)$$

(此写法与下文代码实现一致;另一种常见写法的分母为 $2\sigma^{2}$。)
σ是用户定义的用于确定到达率(reach)或者说函数值跌落到零的速度参数。上述高斯核函数将数据从其特征空间映射到更高维的空间,具体说来这里是映射到了一个无穷维的空间。
具体算法实现的代码如下:
from numpy import *
def loadDataSet(fileName):
    """Load a tab-separated 2-feature training set.

    Each line has the form: feature0 <TAB> feature1 <TAB> label.

    Args:
        fileName: path to the data file.

    Returns:
        (dataMat, labelMat): list of [x0, x1] feature pairs and the
        matching list of float labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed; the original opened the
    # file and never closed it (resource leak).
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split('\t')
            dataMat.append([float(lineArr[0]), float(lineArr[1])])
            labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
def kernelTrans(X, A, kTup):
    """Evaluate the kernel between every row of X and a single sample A.

    Args:
        X: (m, n) data matrix (numpy matrix).
        A: (1, n) row vector.
        kTup: kernel description tuple -- ('lin', _) for the linear kernel,
            ('rbf', sigma) for the Gaussian radial basis kernel.

    Returns:
        (m, 1) column matrix of kernel values.

    Raises:
        NameError: if the kernel name in kTup[0] is not recognized.
    """
    m = shape(X)[0]
    kind = kTup[0]
    if kind == 'lin':
        # Linear kernel: plain inner products <x_j, A>.
        return X * A.T
    if kind == 'rbf':
        K = mat(zeros((m, 1)))
        for row in range(m):
            diff = X[row, :] - A
            K[row] = diff * diff.T  # squared Euclidean distance to A
        # NumPy's divide/exp are element-wise (unlike MATLAB matrix ops).
        return exp(K / (-1 * kTup[1] ** 2))
    raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
class optStruct:
    """Container for all state shared by the Platt SMO optimizer.

    Attributes:
        X: (m, n) training data matrix.
        labelMat: (m, 1) column matrix of class labels.
        C: slack penalty constant.
        tol: tolerance used in the KKT-violation test.
        m: number of training samples.
        alphas: (m, 1) Lagrange multipliers, initialised to zero.
        b: scalar bias term.
        eCache: (m, 2) error cache; column 0 is a validity flag
            (1 = the cached error in column 1 is up to date).
        K: (m, m) precomputed kernel matrix.
    """

    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))
        # Precompute every kernel column once so calcEk/innerL can index
        # K directly instead of re-evaluating the kernel.
        self.K = mat(zeros((self.m, self.m)))
        for col in range(self.m):
            self.K[:, col] = kernelTrans(self.X, self.X[col, :], kTup)
def calcEk(oS, k):
    """Return the prediction error E_k = f(x_k) - y_k for sample k.

    Kernel-aware version: the decision value is read off the precomputed
    kernel column oS.K[:, k] rather than raw feature dot products.
    """
    prediction = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    return prediction - float(oS.labelMat[k])
def selectJrand(i, m):
    """Pick a uniformly random index j in [0, m) with j != i."""
    while True:
        j = int(random.uniform(0, m))
        if j != i:
            return j
def selectJ(i, oS, Ei):
    """Choose the second alpha j by maximising the step size |Ei - Ej|.

    Marks Ei as valid in the error cache, then scans all valid cache
    entries for the index whose error differs most from Ei. On the very
    first pass (no other valid entries) a random j is chosen instead.

    Returns:
        (j, Ej): the chosen index and its prediction error.
    """
    oS.eCache[i] = [1, Ei]  # cache Ei and flag it valid (column 0 = 1)
    candidates = nonzero(oS.eCache[:, 0].A)[0]  # rows with a valid cached error
    if len(candidates) <= 1:
        # Nothing useful cached yet (first iteration): fall back to a
        # random j different from i.
        j = selectJrand(i, oS.m)
        return j, calcEk(oS, j)
    bestK, bestDelta, bestEj = -1, 0, 0  # start from the smallest possible gap
    for k in candidates:
        if k == i:
            continue  # comparing i against itself is wasted work
        Ek = calcEk(oS, k)
        gap = abs(Ei - Ek)
        if gap > bestDelta:
            bestK, bestDelta, bestEj = k, gap, Ek
    return bestK, bestEj
def updateEk(oS, k):
    """Recompute E_k and store it as valid in the error cache.

    Called after any alpha changes so cached errors stay current.
    """
    oS.eCache[k] = [1, calcEk(oS, k)]
def clipAlpha(aj, H, L):
    """Clamp the candidate alpha_j into the box [L, H].

    Matches the original order of operations: the H-cap is applied first,
    then the L-floor (so if L > H the result is L).
    """
    if aj > H:
        aj = H
    return max(aj, L)
def innerL(i, oS):
    """Attempt one SMO optimization step starting from alpha_i.

    If alpha_i violates the KKT conditions beyond oS.tol, a partner j is
    chosen via selectJ, the pair (alpha_i, alpha_j) is jointly updated on
    the constraint box, and the bias b is refreshed.

    Returns:
        1 if the alpha pair changed, 0 otherwise.
    """
    Ei = calcEk(oS, i)
    # KKT-violation test: error outside tolerance AND room left to move alpha_i.
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i, oS, Ei)  # heuristic j choice (differs from the simplified SMO)
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
        # L and H bound the new alpha_j so the pair stays on the constraint box.
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L==H:
            return 0
        # eta uses kernel entries (changed for the kernel version).
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
        if eta >= 0:
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS, j)  # push the new E_j into the error cache
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            # j barely moved: treat the step as a no-op.
            return 0
        # alpha_i moves by the same magnitude as alpha_j, opposite direction.
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS, i)  # push the new E_i into the error cache
        # Bias candidates use kernel entries (differs from the non-kernel version).
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    """Full Platt SMO training loop.

    Alternates between sweeping every sample and sweeping only the
    non-bound alphas (0 < alpha < C), until maxIter passes have run or a
    full sweep changes no pairs.

    Args:
        dataMatIn: training features (m x n).
        classLabels: training labels (+1/-1).
        C: slack penalty constant.
        toler: KKT tolerance.
        maxIter: maximum number of sweeps.
        kTup: kernel description tuple, e.g. ('lin', 0) or ('rbf', sigma).

    Returns:
        (b, alphas): learned bias and the (m, 1) multiplier matrix.
    """
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    passCount = 0
    scanAll = True
    changed = 0
    while passCount < maxIter and (changed > 0 or scanAll):
        changed = 0
        if scanAll:
            # Full sweep over every sample.
            for idx in range(oS.m):
                changed += innerL(idx, oS)
        else:
            # Sweep only alphas strictly inside (0, C) -- the non-bound set.
            nonBound = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for idx in nonBound:
                changed += innerL(idx, oS)
        passCount += 1
        if scanAll:
            scanAll = False   # after a full sweep, restrict to non-bound alphas
        elif changed == 0:
            scanAll = True    # nothing moved: fall back to a full sweep
        print("iteration number: %d" % passCount)
    return oS.b, oS.alphas
def classfy(Xi, sVs, labelSV, alphaSV, b, kTup):
    """Predict the class (+1 or -1) of a single sample.

    Only the support vectors contribute to the decision value:
    f(x) = sum_k alpha_k * y_k * K(sv_k, x) + b.

    Args:
        Xi: (1, n) sample to classify.
        sVs: matrix of support vectors.
        labelSV: labels of the support vectors.
        alphaSV: alphas of the support vectors.
        b: bias term.
        kTup: kernel description tuple.

    Returns:
        sign(f(Xi)): +1.0 or -1.0.
    """
    kernelColumn = kernelTrans(sVs, Xi, kTup)
    decision = kernelColumn.T * multiply(labelSV, alphaSV) + b
    return sign(decision)
下面是测试和2d绘图的代码(注意,本篇的算法适用于多维特征的数据集,但其中的绘图函数只适用于2d特征):
def testRbf(k1=1.3):
    """Train an RBF-kernel SVM on testSetRBF.txt and report error rates.

    Trains on testSetRBF.txt, keeps only the support vectors, then prints
    the training error and the error on the held-out set testSetRBF2.txt,
    plotting both.

    Args:
        k1: sigma of the Gaussian (RBF) kernel; user-tunable.
    """
    dataArr,labelArr = loadDataSet('testSetRBF.txt')  # training set
    kTup=('rbf', k1)
    # NOTE(review): the inherited comment says "C=200 important" but the call
    # passes C=20 -- confirm which value is intended.
    b,alphas = smoP(dataArr, labelArr, C=20, toler =0.0001, maxIter=100, kTup=kTup) #C=200 important
    datMat=mat(dataArr)
    labelMat = mat(labelArr).transpose()
    # Samples that are not support vectors are discarded: they take no
    # part in prediction.
    svIndex=nonzero(alphas.A>0)[0]  # row indices where alpha > 0
    sVs=datMat[svIndex]  # matrix of only the support vectors
    labelSV = labelMat[svIndex]  # labels of the support vectors
    alphaSV = alphas[svIndex]  # alphas of the support vectors
    print ("%d Support Vectors" % shape(sVs)[0])
    m,n = shape(datMat)
    errorCount = 0
    for i in range(m):
        if classfy(datMat[i,:], sVs,labelSV,alphaSV, b, kTup) !=sign(labelArr[i]):
            errorCount += 1
    print ("training error rate is: %.2f%%" % (100*float(errorCount)/m))
    plot(datMat, sVs,labelSV,alphaSV, b, kTup, radius=0.05,title ='Prediction and Support Vectors Circled on training set',set_="training")
    dataArr,labelArr = loadDataSet('testSetRBF2.txt')  # held-out test set
    errorCount = 0
    datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        if classfy(datMat[i,:], sVs,labelSV,alphaSV, b, kTup) !=sign(labelArr[i]):
            errorCount += 1
    print ("test error rate is: %.2f%%" % (100*float(errorCount)/m) )
    plot(datMat, sVs,labelSV,alphaSV, b, kTup, radius=0.05,title ='Prediction on test set',set_="test")
def plot(datMat, sVs, labelSV, alphaSV, b, kTup, radius=0.05, title='Support Vectors Circled', set_="training"):
    """Scatter-plot a 2-feature data set, colored by the SVM's prediction.

    When plotting the training set, each support vector is additionally
    circled. 2D plotting only works for data sets with exactly 2 features.

    Args:
        datMat: (m, 2) data matrix to plot.
        sVs: matrix of support vectors.
        labelSV: labels of the support vectors.
        alphaSV: alphas of the support vectors.
        b: bias term.
        kTup: kernel description tuple.
        radius: radius of the circle drawn around each support vector.
        title: plot title.
        set_: "training" to circle support vectors, anything else to skip.

    Raises:
        ValueError: if datMat does not have exactly 2 feature columns.
    """
    import matplotlib.pyplot as plt
    from matplotlib.patches import Circle
    m, n = datMat.shape
    if n != 2:
        # BUG FIX: the original raised `Error`, an undefined name, which would
        # surface as a confusing NameError; ValueError states the real problem.
        raise ValueError('dataset dimension error, column number does not equal to 2')
    xcord0, ycord0 = [], []
    xcord1, ycord1 = [], []
    for i in range(m):  # classify every sample and bucket it by predicted label
        if classfy(datMat[i, :], sVs, labelSV, alphaSV, b, kTup) == 1:  # predicted +1
            xcord0.append(float(datMat[i, 0]))
            ycord0.append(float(datMat[i, 1]))
        else:  # predicted -1
            xcord1.append(float(datMat[i, 0]))
            ycord1.append(float(datMat[i, 1]))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord0, ycord0, marker='s', s=25, c='green', label="label=+1")
    ax.scatter(xcord1, ycord1, marker='o', s=25, c='red', label="label=-1")
    plt.xlabel("X0")
    plt.ylabel("X1")
    plt.title(title)
    plt.legend()
    if set_ == "training":
        for sv in sVs:
            # Circle each support vector with the given radius.
            circle = Circle((sv[0, 0], sv[0, 1]), radius, facecolor='none', edgecolor='blue', lw=1, alpha=0.5)
            ax.add_patch(circle)
    plt.show()
testRbf(k1 =0.3) # k1 is the sigma of the Gaussian kernel; tune it by hand
结果如下:
本文分享自 Python可视化编程机器学习OpenCV 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!