用树来对数据建模,除了把叶节点简单地设为常数值外,还可以把叶节点设为线性函数,这样整棵树表示的就是一个分段线性函数,这种树称为模型树。如下图中的数据集,如用回归树拟合,势必使树的结构非常复杂;若用模型树拟合,则两个分支足矣。
模型树的大部分代码都和回归树相同,仅仅将叶节点从常数标量改为权重系数列向量。所以相应地多了用最小二乘法求回归系数的过程,以及其它一些必要的修改。
加载数据集和切分数据集:
from numpy import *
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Each non-blank line is stripped, split on tab characters and every field
    is converted with ``float``. By convention of this file the last column
    is treated as the target value by the functions that consume the result.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list containing one list of floats per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record; float('') would raise on it.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` by thresholding one column.

    Rows whose entry in column ``feature`` is strictly below ``value`` form
    the first result; the remaining rows (entry >= ``value``) form the
    second, so the smaller values end up on the left.

    Args:
        dataSet: 2-D NumPy matrix/array to split.
        feature: Index of the column compared against ``value``.
        value: Threshold for the comparison.

    Returns:
        A ``(matLeft, matRight)`` tuple holding the two row subsets.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] yields the row indices where the comparison holds.
    lowerRows = nonzero(column < value)[0]
    upperRows = nonzero(column >= value)[0]
    return dataSet[lowerRows, :], dataSet[upperRows, :]
求回归系数和误差:
#模型树
def linearSolve(dataSet):
    """Fit an ordinary-least-squares linear model to ``dataSet``.

    The last column of ``dataSet`` is taken as the target and the remaining
    columns as features. A column of ones is placed in front of the features
    so that the first coefficient plays the role of the intercept.

    Args:
        dataSet: 2-D NumPy matrix with m rows and n columns (assumed to be a
            ``numpy.matrix`` so that ``*`` is the matrix product).

    Returns:
        A ``(ws, X, Y)`` tuple: ``ws`` is the (n, 1) coefficient column,
        ``X`` the (m, n) design matrix with the leading column of ones and
        ``Y`` the target column sliced from ``dataSet``.

    Raises:
        NameError: If ``X.T * X`` is rank deficient and therefore cannot be
            inverted reliably.
    """
    m, n = shape(dataSet)
    # Design matrix: column 0 stays at 1 (intercept), columns 1..n-1 receive
    # the features. asmatrix is used because the mat alias is gone in NumPy 2.
    X = asmatrix(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n - 1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    # A rank test catches numerically singular systems that an exact
    # comparison of the determinant with 0.0 lets through due to rounding.
    if linalg.matrix_rank(xTx) < n:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
def modelLeaf(dataSet):
    """Build the payload of a model-tree leaf.

    Args:
        dataSet: Rows that fall into the leaf, in the layout expected by
            ``linearSolve``.

    Returns:
        The coefficient vector returned by ``linearSolve`` for ``dataSet``.
    """
    # Only the coefficients are stored in the leaf; X and Y are discarded.
    coefficients = linearSolve(dataSet)[0]
    return coefficients
def modelErr(dataSet):
    """Return the squared error of a linear fit to ``dataSet``.

    Args:
        dataSet: Rows to fit, in the layout expected by ``linearSolve``.

    Returns:
        The sum of squared differences between the targets and the values
        predicted by the linear model fitted with ``linearSolve``.
    """
    weights, features, targets = linearSolve(dataSet)
    # Residual = true value minus the model's prediction for each row.
    residuals = targets - features * weights
    return sum(power(residuals, 2))
选择最佳切分和创建树,代码和回归树的一样:
def chooseBestSplit(dataSet, leafType, errType, ops=(0.5, 4)):
    """Search every feature/value pair for the split with the lowest error.

    Args:
        dataSet: 2-D NumPy matrix whose last column is the target.
        leafType: Callable that turns a data set into a leaf payload.
        errType: Callable that returns the error of a data set.
        ops: ``(tolS, tolN)`` where ``tolS`` is the minimum error reduction
            a split must achieve and ``tolN`` the minimum number of rows
            each side of a split must contain.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when a stop condition is met.
    """
    minGain, minRows = ops

    # Stop condition 1: all targets share one value, nothing left to split.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, columnCount = shape(dataSet)
    baseErr = errType(dataSet)
    lowestErr = inf
    chosenFeat = 0
    chosenVal = 0
    for col in range(columnCount - 1):
        # set() needs a flat list, so the column is flattened before the
        # duplicate candidate values are removed.
        candidates = set(array(dataSet[:, col]).flatten().tolist())
        for threshold in candidates:
            lower, upper = binSplitDataSet(dataSet, col, threshold)
            if shape(lower)[0] >= minRows and shape(upper)[0] >= minRows:
                splitErr = errType(lower) + errType(upper)
                if splitErr < lowestErr:
                    chosenFeat = col
                    chosenVal = threshold
                    lowestErr = splitErr

    # Stop condition 2: the best split does not reduce the error enough.
    if (baseErr - lowestErr) < minGain:
        return None, leafType(dataSet)

    # Stop condition 3: the chosen split leaves one side too small.
    lower, upper = binSplitDataSet(dataSet, chosenFeat, chosenVal)
    if shape(lower)[0] < minRows or shape(upper)[0] < minRows:
        return None, leafType(dataSet)

    return chosenFeat, chosenVal
def createTree(dataSet, leafType=modelLeaf, errType=modelErr, ops=(1,4)):
    """Recursively build a tree by repeatedly taking the best binary split.

    Args:
        dataSet: 2-D NumPy matrix (assumed, so that array filtering works)
            whose last column is the target.
        leafType: Callable that produces the payload stored in a leaf.
        errType: Callable that measures the error of a data set.
        ops: ``(tolS, tolN)`` stop parameters forwarded to
            ``chooseBestSplit``.

    Returns:
        Either a leaf payload (when ``chooseBestSplit`` reports a stop
        condition) or a dict with the keys ``'spInd'``, ``'spVal'``,
        ``'left'`` and ``'right'``.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: feature index 0 is a valid split and must not be
    # confused with the "no split" marker None.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
求解模型树(取 ops=(1.0, 10)):
# Train a model tree on the sample data and report its structure.
myData = loadDataSet('exp2.txt')
# asmatrix is used because the mat alias no longer exists in NumPy 2.
myMat = asmatrix(myData)
modeTree = createTree(myMat, leafType=modelLeaf, errType=modelErr, ops=(1.0, 10))
print(modeTree)
# NOTE(review): getTreeDepth and getNumLeafs are not defined in the visible
# part of this file; they must be provided elsewhere before this line runs.
print("树的深度:%d,叶子节点数:%d" %(getTreeDepth(modeTree),getNumLeafs(modeTree)))
下面利用训练好的模型树进行预测:
def modelTreeEval(model, inDat):
    """Evaluate the linear model of a leaf on a single sample.

    Args:
        model: Coefficient column of shape (n + 1, 1) stored in the leaf;
            the first entry is the intercept.
        inDat: 1 x n matrix holding the feature values of one sample.

    Returns:
        The predicted target value as a Python ``float``.
    """
    n = shape(inDat)[1]
    # Prepend a constant 1 so the first coefficient acts as the intercept.
    X = asmatrix(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # Pull the single element out explicitly: calling float() on a 1x1
    # matrix relies on a conversion that NumPy has deprecated.
    return float((X * model).item())
def isTree(obj):
    """Tell whether ``obj`` is an internal tree node rather than a leaf.

    Internal nodes are stored as dicts; anything else is a leaf payload.

    Args:
        obj: Node or leaf payload to inspect.

    Returns:
        ``True`` if ``obj`` is a dict (or an instance of a dict subclass),
        ``False`` otherwise.
    """
    return isinstance(obj, dict)
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target of one sample by descending the tree.

    Args:
        tree: Internal node (dict with ``'spInd'``, ``'spVal'``, ``'left'``,
            ``'right'``) or a leaf payload.
        inData: Feature values of a single sample, e.g. a 1 x n matrix or a
            flat sequence.
        modelEval: Callable ``modelEval(leaf, inData)`` that turns a leaf
            payload into a prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf reached by the sample.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[k] would select
    # row k instead of feature k, which breaks for every feature index > 0.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Values below the split value live in the left branch, matching
    # binSplitDataSet.
    if featureValue < tree['spVal']:
        branch = tree['left']
    else:
        branch = tree['right']
    # The recursive call handles both sub-trees and leaves.
    return treeForeCast(branch, inData, modelEval)
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every sample in ``testData``.

    Args:
        tree: Trained tree (or leaf payload) passed on to ``treeForeCast``.
        testData: Indexable collection of samples; ``testData[i]`` is the
            i-th sample.
        modelEval: Leaf evaluation callable forwarded to ``treeForeCast``.

    Returns:
        An (m, 1) matrix of predictions, one row per sample.
    """
    m = len(testData)
    # asmatrix is used because the mat alias no longer exists in NumPy 2.
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        # Each sample is wrapped as a matrix before it is handed down.
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat
# Split the training matrix into features (all columns but the last) and
# targets (last column), then predict with the trained model tree.
X = myMat[:, 0:-1]
Y= myMat[:, -1]
Yhat = createForeCast(modeTree, X , modelEval = modelTreeEval)# predicted values
有了预测值,通过计算相关系数我们可以衡量模型树的拟合效果:
# Pearson correlation coefficient between the true and the predicted values.
# Y and Yhat are both derived from myMat, i.e. the data the tree was fitted
# on, so this measures the fit on the training data.
# NOTE(review): despite its name, R2 holds the correlation coefficient read
# from the correlation matrix, not its square.
R2 = corrcoef(Y, Yhat, rowvar =0) [0,1] # corrcoef comes from numpy
print("训练集上预测值与真实值间的皮尔逊相关系数为 %f" % R2)
>>>训练集上预测值与真实值间的皮尔逊相关系数为 0.999377
最后,我们以linspace函数创建虚拟测试集,用测试集上的预测值画出分段拟合的直线:
# Plot the raw samples together with the piecewise-linear prediction curve.
from matplotlib import pyplot as plt
# 1000 evenly spaced x values in [0, 1] serve as a synthetic test set.
testData = linspace(0,1,1000) # linspace comes from numpy
# Alternative data source kept from the original, currently disabled:
#myData = loadDataSet('ex2test.txt')
myArray = array(myData)
# Scatter the first column against the last (target) column.
# NOTE(review): the original comment claimed myArray[:,0] is the constant 1;
# that cannot be confirmed from here - verify against the data file.
plt.scatter(myArray[:,0], myArray[:,-1],s=25, color ="b")
yHat = createForeCast(modeTree, testData, modelEval = modelTreeEval) # predicted values
plt.plot(testData,yHat,lw =2, c="r")
plt.xlabel("x")
plt.ylabel("y")
plt.title("模型树分段线性拟合", fontsize =16, color ="k")
plt.show()
本文分享自 Python可视化编程机器学习OpenCV 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体分享计划 ,欢迎热爱写作的你一起参与!