Python之决策树:预测隐形眼镜类型

定义:决策树是一种树形结构,其中每个内部节点表示一个属性上的测试,每个分支代表一个测试输出,每个叶节点代表一种类别。

算法:此篇主要使用ID3算法:以信息增益度量属性选择,选择分裂后信息增益最大的属性进行分裂。

总结:决策树分类器就像带有终止块的流程图,终止块表示分类结果。

开始处理数据集时,我们首先需要测量集合中数据的不一致性,也就是熵;

然后寻找最优方案划分数据集,直到数据集中的所有数据属于同一分类。

附code:

##决策树的创建过程

from math import log

import operator

##计算香农熵

def calcShannonEnt(dataSet):

numEntries = len(dataSet) #计算数据集中实例的总数。

labelCounts = {}

for featVec in dataSet:

currentLabel = featVec[-1] #创建一个数据字典,它的键值是最后一列的数值.

if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0#如果当前键值不存在,则扩展字典并将当前键值加入字典。

labelCounts[currentLabel] += 1

shannonEnt = 0.0

for key in labelCounts:

prob = float(labelCounts[key])/numEntries#使用所有类标签的发生频率计算类别出现的概率。

shannonEnt -= prob * log(prob,2) #用这个概率计算香农熵 ,统计所有类标签发生的次数

return shannonEnt

##创建数据集

def createDataSet():

dataSet = [[1, 1, 'yes'],

[1, 1, 'yes'],

[1, 0, 'no'],

[0, 1, 'no'],

[0, 1, 'no']]

labels = ['no surfacing','flippers']

#change to discrete values

return dataSet, labels

######################################

myDat,labels=createDataSet()

myDat

labels

calcShannonEnt(myDat)

#熵越高,则混合的数据也越多,我们可以在数据集中添加更多的分类,观察熵是如何变化的。

#这里我们增加第三个名为maybe的分类,测试熵的变化

myDat[0][-1]='maybe'

myDat

calcShannonEnt(myDat)

########################################

##按照给定特征划分数据集

def splitDataSet(dataSet, axis, value):#使用了三个输入参数:待划分的数据集、划分数据集的特征、特征的返回值。

retDataSet = []#创建一个新的列表对象

for featVec in dataSet:#数据集这个列表中的各个元素也是列表,遍历数据集中的每个元素

if featVec[axis] == value:#一旦发现符合要求的值,则将其添加到新创建的列表中。在if语句中,程序将符合特征的数据抽取出来

reducedFeatVec = featVec[:axis]

reducedFeatVec.extend(featVec[axis+1:])

retDataSet.append(reducedFeatVec)

return retDataSet

##########################################

myDat,labels=createDataSet()

myDat

splitDataSet(myDat,0,1)

splitDataSet(myDat,0,0)

##########################################

##选择最好的数据集划分方式

def chooseBestFeatureToSplit(dataSet):

numFeatures = len(dataSet[0]) - 1 #the last column is used for the labels

baseEntropy = calcShannonEnt(dataSet) #计算了整个数据集的原始香农熵

bestInfoGain = 0.0; bestFeature = -1

for i in range(numFeatures): #iterate over all the features

featList = [example[i] for example in dataSet]#create a list of all the examples of this feature

uniqueVals = set(featList) #get a set of unique values

newEntropy = 0.0

for value in uniqueVals:

subDataSet = splitDataSet(dataSet, i, value)

prob = len(subDataSet)/float(len(dataSet))

newEntropy += prob * calcShannonEnt(subDataSet)

infoGain = baseEntropy - newEntropy #calculate the info gain; ie reduction in entropy

if (infoGain > bestInfoGain): #compare this to the best gain so far

bestInfoGain = infoGain #if better than current best, set to best

bestFeature = i

return bestFeature #returns an integer

##################################################

myDat,labels=createDataSet()

myDat

chooseBestFeatureToSplit(myDat)#第0个特征是最好的用于划分数据集的特征。

##################################################

## 递归构建决策树

def majorityCnt(classList):

classCount={}

for vote in classList:

if vote not in classCount.keys(): classCount[vote] = 0

classCount[vote] += 1

sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)

return sortedClassCount[0][0]

def createTree(dataSet,labels):

classList = [example[-1] for example in dataSet]

if classList.count(classList[0]) == len(classList):

return classList[0]#递归函数的第一个停止条件是所有的类标签完全相同,则直接返回该类标签

if len(dataSet[0]) == 1: #第二个停止条件是使用完了所有特征,仍然不能将数据集划分成仅包含唯一类别的分组

return majorityCnt(classList)

bestFeat = chooseBestFeatureToSplit(dataSet)

bestFeatLabel = labels[bestFeat]

myTree = }

del(labels[bestFeat])

featValues = [example[bestFeat] for example in dataSet]

uniqueVals = set(featValues)

for value in uniqueVals:

subLabels = labels[:] #当前数据集选取的最好特征存储在变量bestFeat中,得到列表包含的所有属性值

myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)

#最后代码遍历当前选择特征包含的所有属性值,在每个数据集划分上递归调用函数createTree(),得到的返回值将被插入到字典变量myTree中,

#因此函数终止执行时,字典中将会嵌套很多代表叶子节点信息的字典数据。

return myTree

#################################################

myDat,labels=createDataSet()

myTree=createTree(myDat,labels)

myTree

#################################################

#测试算法:使用决策树执行分类

def classify(inputTree,featLabels,testVec):

firstStr = inputTree.keys()[0]

secondDict = inputTree[firstStr]

featIndex = featLabels.index(firstStr)

key = testVec[featIndex]

valueOfFeat = secondDict[key]

if isinstance(valueOfFeat, dict):

classLabel = classify(valueOfFeat, featLabels, testVec)

else: classLabel = valueOfFeat

return classLabel

#############################################

myDat,labels=createDataSet()

labels

myTree=retrieveTree(0)

myTree

#使用算法:决策树的存储,使用pickle模块存储决策树

def storeTree(inputTree,filename):

import pickle

fw = open(filename,'w')

pickle.dump(inputTree,fw)

fw.close()

def grabTree(filename):

import pickle

fr = open(filename)

return pickle.load(fr)

####################################################

import pandas as pd

import os

#更改当前工作目录

os.chdir('C:\Users\E440\Desktop\PythonStudy\MLiA_SourceCode\machinelearninginaction\Ch03')

os.getcwd()

storeTree(myTree,'classifierStorage2.txt')

grabTree('classifierStorage2.txt')

%matplotlib作图

import matplotlib.pyplot as plt

##使用文本注解绘制树节点

#定义文本框和箭头模式

decisionNode = dict(boxstyle="sawtooth", fc="0.8")

leafNode = dict(boxstyle="round4", fc="0.8")

arrow_args = dict(arrowstyle="

#绘制带箭头的注解

def plotNode(nodeTxt, centerPt, parentPt, nodeType):

xytext=centerPt, textcoords='axes fraction',

va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )

'''def createPlot():

fig = plt.figure(1, facecolor='white')

fig.clf()

createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses

plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)

plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)

plt.show()

createPlot()'''

##构造注解数

#获取叶节点的数目和树的层数

def getNumLeafs(myTree):

numLeafs = 0

firstStr = myTree.keys()[0]

secondDict = myTree[firstStr]

for key in secondDict.keys():

if type(secondDict[key]).__name__=='dict':#type()函数可以判断子节点是否为字典类型

numLeafs += getNumLeafs(secondDict[key])

#如果子节点是字典类型,则该节点也是一个判断节点,需要递归调用getNumLeafs()函数。

#getNumLeafs()函数遍历整棵树,累计叶子节点的个数,并返回该数值。

else: numLeafs +=1

return numLeafs

#getTreeDepth()计算遍历过程中遇到判断节点的个数

def getTreeDepth(myTree):

maxDepth = 0

firstStr = myTree.keys()[0]

secondDict = myTree[firstStr]

for key in secondDict.keys():

if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes

thisDepth = 1 + getTreeDepth(secondDict[key])

else: thisDepth = 1

if thisDepth > maxDepth: maxDepth = thisDepth

return maxDepth

#plotMidText()计算父节点和子节点的中间位置,并在此处添加简单的文本标签信息

def plotMidText(cntrPt, parentPt, txtString):

xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]

yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]

def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on

numLeafs = getNumLeafs(myTree) #this determines the x width of this tree

depth = getTreeDepth(myTree) #函数plotTree()首先计算树的宽和高

firstStr = myTree.keys()[0] #the text label for this node should be this

cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)

plotMidText(cntrPt, parentPt, nodeTxt) #绘出子节点具有的特征值

plotNode(firstStr, cntrPt, parentPt, decisionNode)

secondDict = myTree[firstStr]

plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD#按比例减少全局变量plotTree.yOff,并标注此处将要绘制子节点(因为我们是自顶向下绘制图形,因此需要依次递减y坐标值)

for key in secondDict.keys():

if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes

plotTree(secondDict[key],cntrPt,str(key)) #recursion

else: #it's a leaf node print the leaf node

plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW

plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)

plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))

plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

#if you do get a dictonary you know it's a tree, and the first element will be another dict

def createPlot(inTree):

fig = plt.figure(1, facecolor='white')

fig.clf()

axprops = dict(xticks=[], yticks=[])

createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) #no ticks

#createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses

plotTree.totalW = float(getNumLeafs(inTree))

plotTree.totalD = float(getTreeDepth(inTree))

plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;

plotTree(inTree, (0.5,1.0), '')

plt.show()

######################################################

#函数retrieveTree输出预先存储的树信息

def retrieveTree(i):

listOfTrees=[{'no surfacing':}}},

{'no surfacing':},1:'no'}}}}]#tree的最终结果

return listOfTrees[i]

retrieveTree(1)

myTree=retrieveTree(0)

getNumLeafs(myTree)#叶子节点个数:叶子结点就是度为0的结点 就是没有子结点的结点

getTreeDepth(myTree)#树的层数

createPlot(myTree)

myTree['no surfacing'][3]='maybe'

myTree

createPlot(myTree)

myTree=retrieveTree(1)

createPlot(myTree)

######################################################

###示例:使用决策树预测隐形眼镜类型

fr=open('lenses.txt')

lenses=[inst.strip().split('\t') for inst in fr.readlines()]

lensesLabels=['age','prescript','astigmagic','tearRate']

lensesTree=createTree(lenses,lensesLabels)

lensesTree

createPlot(lensesTree)

  • 发表于:
  • 原文链接http://kuaibao.qq.com/s/20180327G1C7IG00?refer=cp_1026
  • 腾讯「云+社区」是腾讯内容开放平台帐号(企鹅号)传播渠道之一,根据《腾讯内容开放平台服务协议》转载发布内容。

扫码关注云+社区

领取腾讯云代金券