写在前面
SVM是一个很庞杂的体系,前面的系列博客已经分别讨论了SVM各个方面的大致原理。
本篇博文主要是对SVM系列博客的一个实践,手写SVM来简单地对指定数据集进行分类。
数据文件:SVM数据集[1],提取码:dfz3
import pandas as pd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
class SMOStruct:
    """State container for the SMO (Sequential Minimal Optimization) solver."""

    def __init__(self, X, y, C, kernel, toler):
        self.N = X.shape[0]               # number of training samples
        self.X = X                        # training sample matrix
        self.y = y                        # class labels
        self.C = C                        # regularization (penalty) parameter
        self.kernel = kernel              # kernel spec: [name, *params]
        self.b = 0                        # scalar bias term
        self.E = np.zeros(self.N)         # cached prediction errors, one per sample
        self.toler = toler                # KKT violation tolerance
        self.lambdas = np.zeros(self.N)   # Lagrange multipliers, one per sample
        # Precompute the full kernel (Gram) matrix K[i, j] = k(x_i, x_j).
        self.K = np.zeros((self.N, self.N))
        for row in range(self.N):
            for col in range(self.N):
                self.K[row, col] = Kernel(X[row].T, X[col].T, kernel)
#加载数据
def load_data(path, n_train=80):
    """Load a tab-separated dataset of 3 columns (two features, one label).

    Labels equal to 0 are remapped to -1 so they fit the SVM convention.

    Args:
        path: path to the tab-separated data file.
        n_train: number of leading rows used as the training split
                 (default 80, matching the original hard-coded split).

    Returns:
        (x, y, train_x, train_y, test_x, test_y) as numpy arrays;
        x has shape (m, 2) and y has shape (m,).
    """
    data = pd.read_csv(path, sep='\t', names=list(range(3)))
    arr = np.asarray(data, dtype=float)
    x = arr[:, :-1]
    y = arr[:, -1].copy()
    y[y == 0] = -1  # map {0, 1} labels to {-1, 1}
    return x, y, x[:n_train], y[:n_train], x[n_train:], y[n_train:]
def Kernel(xi, xj, kernel):
    """Evaluate the kernel function k(xi, xj).

    Args:
        xi, xj: 1-D feature vectors.
        kernel: spec list [name, *params], where name is one of
                'linear', 'rbf', 'poly', 'laplace', 'sigmoid' and
                kernel[1] (and kernel[2] for 'sigmoid') are parameters.

    Returns:
        The scalar kernel value; 0.0 for an unknown kernel name
        (same fallback as the original implementation).
    """
    name = kernel[0]
    xi = np.asarray(xi)
    xj = np.asarray(xj)
    if name == 'linear':       # plain inner product
        return float(np.dot(xi, xj))
    if name == 'rbf':          # Gaussian kernel, kernel[1] = sigma
        sq = float(np.sum((xi - xj) ** 2))
        return float(np.exp(-sq / (2 * kernel[1] ** 2)))
    if name == 'poly':         # polynomial kernel, kernel[1] = degree
        return float(np.dot(xi, xj) ** kernel[1])
    if name == 'laplace':      # Laplacian kernel, kernel[1] = sigma
        dist = float(np.sqrt(np.sum((xi - xj) ** 2)))
        return float(np.exp(-dist / kernel[1]))
    if name == 'sigmoid':      # tanh kernel, kernel[1] = scale, kernel[2] = offset
        return float(np.tanh(kernel[1] * np.dot(xi, xj) + kernel[2]))
    return 0.0
def kkt(model, i):
    """Return True when sample i violates the KKT conditions beyond model.toler."""
    r = model.y[i] * cacEi(model, i)
    lam = model.lambdas[i]
    # Violation: margin too small while lambda can still grow, or
    # margin too large while lambda can still shrink.
    return (r < -model.toler and lam < model.C) or (r > model.toler and lam > 0)
#选择第一个优化变量
def outerLoop(model):
for i in range(model.N):
if kkt(model, i): #违反kkt条件
return i
#寻找第二个优化变量
def inner_loop(model, i):
E1 = cacEi(model, i)
max_diff = 0
j = None
E = np.nonzero(model.E)[0]
# print(model.E)
if len(E) > 1:
for k in E:
if k == i:
continue
Ek = cacEi(model, k)
diff = np.abs(Ek - E1)
# print("diff",diff)
if diff > max_diff:
max_diff = diff
j = k
else:
j = i
while j == i:
j = int(np.random.uniform(0, model.N))
return j
#计算f(xk)
def calfx(model, k):
sum = 0.0
for i in range(model.N):
sum += (model.lambdas[i] * model.y[i] * model.K[i, k])
sum += model.b
return sum
#计算Ek for i in range(N): lambda_i*y_i*K(i,k)
def cacEi(model, k):
    """Return the prediction error E_k = f(x_k) - y_k."""
    fxk = calfx(model, k)
    return fxk - float(model.y[k])
#核心程序
def update(model, i, j):
print(i, j)
lambdai_old = model.lambdas[i]
lambdaj_old = model.lambdas[j]
if model.y[i] != model.y[j]:
L = max(0.0, model.lambdas[j] - model.lambdas[i])
H = min(model.C, model.C + model.lambdas[j] - model.lambdas[i])
else:
L = max(0.0, model.lambdas[j] + model.lambdas[i] - model.C)
H = min(model.C, model.lambdas[j] + model.lambdas[i])
if L == H:
return 0
eta = model.K[i, i] + model.K[j, j] - 2.0 * model.K[i, j]
if eta <= 0:
return 0
lambdaj_new_unc = lambdaj_old + model.y[j] * (model.E[i] - model.E[j]) / eta
lambdaj_new = clipBonder(L, H, lambdaj_new_unc)
lambdai_new = lambdai_old + model.y[i] * model.y[j] * (lambdaj_old - lambdaj_new)
#更新参数lambda
model.lambdas[i] = lambdai_new
model.lambdas[j] = lambdaj_new
#更新阈值b
b1 = model.b - model.E[i] - model.y[i] * (lambdai_new - lambdai_old) * model.K[i, i] - model.y[j] * (
lambdaj_new - lambdaj_old) * model.K[i, j]
b2 = model.b - model.E[j] - model.y[i] * (lambdai_new - lambdai_old) * model.K[i, j] - model.y[j] * (
lambdaj_new - lambdaj_old) * model.K[j, j]
if model.lambdas[i] > 0 and model.lambdas[i] < model.C:
model.b = b1
elif model.lambdas[j] > 0 and model.lambdas[j] < model.C:
model.b = b2
else:
model.b = (b1 + b2) / 2.0
#更新两个E
model.E[i] = cacEi(model, i)
model.E[j] = cacEi(model, j)
return 1 #更新成功
#计算w
def calW(lambdas, X, y):
m, n = X.shape
w = np.zeros((n, 1))
for i in range(n):
for j in range(m):
w[i] += (lambdas[j] * y[j] * X[j, i])
return w
#裁剪
def clipBonder(L, H, lambda_):
if lambda_ > H:
return H
elif lambda_ <= H and lambda_ >= L:
return lambda_
else:
return L
#smo主程序
def smo_main(C, kernel, toler):
x, y, train_x, train_y, test_x, test_y = load_data('SVM数据集/testSet.txt')
model = SMOStruct(train_x, train_y, C, kernel, toler)
max_step = 50
while max_step > 0:
for i in range(model.N):
if kkt(model, i):
j = inner_loop(model, i)
update(model, i, j)
max_step -= 1
w = calW(model.lambdas, model.X, model.y)
return model, model.lambdas, w, model.b
def plotSVM(model, w, path='SVM数据集/testSet.txt'):
    """Scatter-plot all samples colored by the sign of w.x + b and draw the
    separating line.

    Args:
        model: trained SMOStruct (only model.b is used).
        w: (2, 1) weight vector from calW.
        path: data file path (new parameter; defaults to the original
              hard-coded location for backward compatibility).
    """
    x, y, train_x, train_y, test_x, test_y = load_data(path)
    w1 = w[0, 0]
    w2 = w[1, 0]
    b = model.b
    x0 = []; y0 = []
    x1 = []; y1 = []
    for i in range(len(x)):
        # Classify by the sign of the decision function.
        res = np.sign(np.dot(w.T, x[i, :]) + b)
        if res == 1:
            x0.append(x[i, 0])
            y0.append(x[i, 1])
        else:
            x1.append(x[i, 0])
            y1.append(x[i, 1])
    plt.scatter(x0, y0, c='r', marker='*')
    plt.scatter(x1, y1, c='blue', marker='*')
    plt.legend(['Class_1', 'Class_2'])
    # Decision boundary: w1*x + w2*y + b = 0  =>  y = -b/w2 - (w1/w2)*x.
    xs = np.linspace(0, 10, 1000)
    plt.plot(xs, -b / w2 - w1 / w2 * xs)
    plt.show()
def SVM():
    """Train the hand-written SMO SVM, print its test accuracy, and plot it.

    Fix over the original: the accuracy counter no longer shadows the
    builtin `sum`.
    """
    C = 0.6
    toler = 0.001
    kernel = ['linear', 0.0]
    model, lambdas, w, b = smo_main(C, kernel, toler)
    x, y, train_x, train_y, test_x, test_y = load_data('SVM数据集/testSet.txt')
    correct = 0
    for i in range(len(test_y)):
        # Predict with the sign of the linear decision function.
        res = np.sign(np.dot(w.T, test_x[i, :]) + b)
        if res == test_y[i]:
            correct += 1
    print('手写正确率:%.2f%%' % (correct / len(test_y) * 100))
    plotSVM(model, w)
def sklearn_svm():
    """Train sklearn's SVC on the same split and print its test accuracy."""
    _, _, train_x, train_y, test_x, test_y = load_data('SVM数据集/testSet.txt')
    clf = svm.SVC()
    clf.fit(train_x, train_y)
    print('调包正确率:%.2f%%' % (clf.score(test_x, test_y) * 100))
if __name__ == '__main__':
    # Run the hand-written SMO SVM first, then the sklearn baseline for comparison.
    SVM()
    sklearn_svm()
结果展示:
[1]
SVM数据集: https://pan.baidu.com/s/1GDrZ-TGVftcQIzPukaJhoQ
KI的算法杂记
CSDN博客
@Cyril_KI