模型树

用户6021899

发布于 2019-08-19 10:33:52

4570

发布于 2019-08-19 10:33:52

文章被收录于专栏：Python编程 pyqt matplotlib

用树来对数据建模，除了把叶节点简单地设为常数值外，还可以把叶节点设为分段线性函数。后者就可以称之为模型树。如下图中的数据集，如用回归树拟合，势必使树的结构非常复杂。如若用模型树拟合，则两个分支足矣。

模型树的大部分代码都和回归树相同，仅仅将叶节点从常数标量改为权重系数列向量（由最小二乘法求得的回归系数）。所以相应地多了用最小二乘法求回归系数的过程，以及其它的一些必要修改。

加载数据集和切分数据集：

from numpy import *
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Each non-blank line becomes one list of floats. Callers in this file
    treat the last column as the target value.

    Args:
        fileName: path of the text file to read.

    Returns:
        A list of lists of floats, one inner list per non-blank line.

    Raises:
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even if a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
    
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of dataSet in two by thresholding one column.

    Rows whose `feature` column is strictly below `value` go to the first
    result; rows at or above `value` go to the second (smaller on the left,
    larger on the right).

    Args:
        dataSet: 2-D NumPy matrix of samples.
        feature: index of the column to compare.
        value: threshold for the comparison.

    Returns:
        A (lower, upper) pair of row subsets of dataSet.
    """
    column = dataSet[:, feature]
    lowerRows = nonzero(column < value)[0]
    upperRows = nonzero(column >= value)[0]
    return dataSet[lowerRows, :], dataSet[upperRows, :]

求回归系数和误差：

#模型树
def linearSolve(dataSet):   #helper function used in two places
    """Fit ordinary least squares on dataSet and return the pieces.

    The last column of dataSet is the target; the remaining columns are the
    features. A column of ones is prepended to the features so the first
    coefficient acts as the intercept.

    Args:
        dataSet: 2-D samples (NumPy matrix, ndarray or nested list).

    Returns:
        (ws, X, Y): the coefficient column vector, the design matrix with
        the leading column of ones, and the target column.

    Raises:
        NameError: if X.T * X has a determinant of exactly 0.0.
    """
    # asmatrix instead of the `mat` alias, which NumPy 2.0 removed; it also
    # lets plain arrays / nested lists take part in the matrix products below.
    dataSet = asmatrix(dataSet)
    m, n = shape(dataSet)
    X = asmatrix(ones((m, n)))        # column 0 stays 1 (intercept term)
    X[:, 1:n] = dataSet[:, 0:n-1]     # features go in columns 1..n-1
    Y = dataSet[:, -1]                # target column
    xTx = X.T * X
    # NOTE: exact comparison, so only an exactly singular matrix is rejected.
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)            # normal equations
    return ws, X, Y
    
def modelLeaf(dataSet):
    """Build a leaf: the linear-model coefficients fitted on dataSet.

    Returns only the first element (the coefficients) of linearSolve's
    result; the design matrix and targets are discarded.
    """
    return linearSolve(dataSet)[0]
    
def modelErr(dataSet):
    """Return the squared error of a linear fit on dataSet.

    Fits the model with linearSolve, then sums the squared differences
    between the true targets and the fitted predictions.
    """
    ws, X, Y = linearSolve(dataSet)
    residual = Y - X * ws  # true value minus predicted value
    return sum(power(residual, 2))

选择最佳切分和创建树，代码和回归树的一样：

def chooseBestSplit(dataSet, leafType, errType, ops=(0.5, 4)):
    """Find the (feature, threshold) split that minimises total error.

    Args:
        dataSet: 2-D NumPy matrix; the last column is the target.
        leafType: callable building a leaf value from a data subset.
        errType: callable returning the error of a data subset.
        ops: (minimum error reduction required to split,
              minimum number of samples allowed on each side).

    Returns:
        (featureIndex, threshold) for the best split, or
        (None, leafType(dataSet)) when a stop condition is hit.
    """
    minGain, minSamples = ops

    # Stop condition 1: all target values are equal, nothing to split.
    targets = dataSet[:, -1].T.tolist()[0]
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    nRows, nCols = shape(dataSet)
    baseErr = errType(dataSet)  # error without any split
    bestErr, bestIndex, bestValue = inf, 0, 0
    for col in range(nCols - 1):
        # A set removes duplicate thresholds; the column is flattened first
        # because set() cannot hash the nested lists of a 2-D column.
        candidates = set(array(dataSet[:, col]).flatten().tolist())
        for threshold in candidates:
            lower, upper = binSplitDataSet(dataSet, col, threshold)
            bothBigEnough = (shape(lower)[0] >= minSamples
                             and shape(upper)[0] >= minSamples)
            if not bothBigEnough:
                continue
            splitErr = errType(lower) + errType(upper)
            if splitErr < bestErr:
                bestErr, bestIndex, bestValue = splitErr, col, threshold

    # Stop condition 2: the best split does not reduce the error enough.
    if (baseErr - bestErr) < minGain:
        return None, leafType(dataSet)

    # Stop condition 3: the chosen split leaves one side too small.
    lower, upper = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(lower)[0] < minSamples) or (shape(upper)[0] < minSamples):
        return None, leafType(dataSet)

    # Best feature to split on and the threshold used for that split.
    return bestIndex, bestValue

def createTree(dataSet, leafType=modelLeaf, errType=modelErr, ops=(1,4)):
    """Recursively build a tree from dataSet.

    Args:
        dataSet: 2-D NumPy matrix (needed for the array filtering done by
            binSplitDataSet); the last column is the target.
        leafType: callable building a leaf value from a data subset.
        errType: callable returning the error of a data subset.
        ops: stop-condition parameters forwarded to chooseBestSplit.

    Returns:
        A leaf value when splitting stops, otherwise a dict with keys
        'spInd' (split feature), 'spVal' (split threshold), 'left' and
        'right' (subtrees or leaves).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        # A stop condition was hit; val is the leaf value.
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }

取 ops=(1.0, 10)，求解出模型树：

# Load the samples and train a model tree on them.
myData = loadDataSet('exp2.txt')
# asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
myMat = asmatrix(myData)
modeTree = createTree(myMat, leafType=modelLeaf, errType=modelErr, ops=(1.0, 10))
print(modeTree)
# NOTE(review): getTreeDepth / getNumLeafs are not defined in this listing;
# they must be provided elsewhere before this line runs.
print("树的深度：%d，叶子节点数：%d" %(getTreeDepth(modeTree),getNumLeafs(modeTree)))

下面利用训练好的模型树进行预测：

def modelTreeEval(model, inDat):
    """Evaluate a linear leaf model on one sample.

    Args:
        model: coefficient column with n+1 rows (intercept first).
        inDat: 1 x n row of feature values.

    Returns:
        The prediction as a Python float.
    """
    n = shape(inDat)[1]
    # Prepend a constant 1 so the first coefficient acts as the intercept.
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    X = asmatrix(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # .item() pulls the single element out explicitly; calling float() on a
    # 1x1 matrix relies on deprecated array-to-scalar conversion.
    return float((X * model).item())
    
def isTree(obj):
    """Return True if obj is an internal tree node (a dict), else False.

    Uses isinstance rather than comparing the type name, so dict subclasses
    are recognised as nodes too.
    """
    return isinstance(obj, dict)
    
def treeForeCast(tree, inData, modelEval=None):
    """Predict one sample by walking the tree down to a leaf.

    Args:
        tree: a tree dict with keys 'spInd', 'spVal', 'left', 'right',
            or a bare leaf value.
        inData: feature values of one sample (1 x n matrix or 1-D sequence).
        modelEval: callable (leaf, inData) -> prediction. When omitted,
            regTreeEval is looked up at call time.

    Returns:
        Whatever modelEval returns for the leaf that the sample reaches.
    """
    if modelEval is None:
        # NOTE(review): regTreeEval is not defined in the visible part of
        # this file. Resolving it here instead of in the signature keeps the
        # `def` itself from raising NameError when it is absent.
        modelEval = regTreeEval
    # Flatten so the split feature is read by column position. Indexing a
    # 1 x n matrix with a single integer selects a row, which is wrong for
    # any feature other than the first.
    features = asarray(inData).ravel()
    node = tree
    while isinstance(node, dict):
        # Values below the threshold go left, the rest go right.
        if features[node['spInd']] < node['spVal']:
            node = node['left']
        else:
            node = node['right']
    return modelEval(node, inData)
       
def createForeCast(tree, testData, modelEval=None):
    """Predict every sample in testData with the given tree.

    Args:
        tree: tree dict or bare leaf, as accepted by treeForeCast.
        testData: indexable collection of samples; len(testData) gives the
            number of samples and testData[i] is one sample.
        modelEval: callable (leaf, inData) -> prediction. When omitted,
            regTreeEval is looked up at call time.

    Returns:
        An m x 1 matrix of predictions, one row per sample.
    """
    if modelEval is None:
        # NOTE(review): regTreeEval is not defined in the visible part of
        # this file. Resolving it here instead of in the signature keeps the
        # `def` itself from raising NameError when it is absent.
        modelEval = regTreeEval
    m = len(testData)
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat

# Every column but the last holds the features; the last column is the target.
X, Y = myMat[:, :-1], myMat[:, -1]
# Predictions of the trained model tree for the same samples.
Yhat = createForeCast(modeTree, X, modelEval=modelTreeEval)

有了预测值，通过计算相关系数我们可以衡量模型树的拟合效果：

# Pearson correlation between the true targets and the predictions. Both come
# from the data the tree was trained on, so this measures training fit.
# Despite its name, R2 holds the correlation coefficient itself, not its square.
corrMatrix = corrcoef(Y, Yhat, rowvar=0)  # rowvar=0: columns are the variables
R2 = corrMatrix[0, 1]
print("训练集上预测值与真实值间的皮尔逊相关系数为 %f" % R2)

>>>预测值与真实值间的皮尔逊相关系数为 0.999377

最后，我们以linspace函数创建虚拟测试集，用测试集上预测值画出分段拟合的直线：

from matplotlib import pyplot as plt

# Evenly spaced x values on [0, 1] (numpy linspace), used as a synthetic test set.
testData = linspace(0, 1, 1000)
# A different sample file could be loaded into myData here instead,
# e.g. loadDataSet('ex2test.txt').
myArray = array(myData)
# Model-tree predictions along the synthetic x values.
yHat = createForeCast(modeTree, testData, modelEval=modelTreeEval)

# Samples as blue points (first column vs. last column), fit as a red line.
plt.scatter(myArray[:, 0], myArray[:, -1], s=25, color="b")
plt.plot(testData, yHat, lw=2, c="r")
plt.title("模型树分段线性拟合", fontsize=16, color="k")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2019-08-13，如有侵权请联系 cloudcommunity@tencent.com 删除

python

本文分享自 Python可视化编程机器学习OpenCV 微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

python

登录后参与评论

0 条评论

热度

模型树

模型树

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐