# -*- coding: utf-8 -*-
# Decision-tree (ID3) example: choosing a toy car.
"""
Author: vincentqiao
"""
# Fixed: the scraped source had `frommath import log` / `importoperator`,
# which are syntax errors; the coding declaration must be on line 1 or 2
# (PEP 263), so it is moved to the top.
from math import log

import operator

def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels in *dataSet*.

    Each record is a list whose LAST element is the class label;
    entropy is computed over the label frequencies, in bits (log base 2).

    Fixed: the scraped source mangled `def calcShannonEnt` and
    `not in labelCounts` into single tokens (syntax errors).
    """
    numEntries = len(dataSet)
    # Count how many records carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # H = -sum(p * log2(p))
    return shannonEnt

def createDataSet():
    """Return the demo (dataSet, labels) pair for the toy-car example.

    Each record is [Metal, Brand, Color, Model, class]; the last element
    ('yes'/'no') says whether the toy car was bought.

    Fixed: the scraped source mangled `def createDataSet` into one token.
    """
    dataSet = [
        ['?', 'HW', 'Blue', 'Car', 'no'],
        ['N', 'HW', 'Red', 'Car', 'no'],
        ['?', 'SK', 'Red', 'Car', 'yes'],
        ['Y', 'SK', 'Mix', 'SUV', 'yes'],
        ['?', 'SK', 'Mix', 'Truck', 'yes'],
        ['N', 'SK', 'Blue', 'SUV', 'no'],
        ['N', 'SK', 'Mix', 'SUV', 'no'],
        ['?', 'HW', 'Red', 'Truck', 'no'],
    ]
    # Feature names for the first four columns, in order.
    labels = ['Metal', 'Brand', 'Color', 'Model']
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    """Return the records whose feature at index *axis* equals *value*,
    with that feature column removed from each returned record.

    The input records are not mutated (new lists are built by slicing).
    Fixed: the scraped source mangled `def splitDataSet` into one token.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Concatenate the slices around `axis` to drop that column.
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    The last column of each record is the class label and is never a
    candidate. Returns -1 if no split improves on zero gain.

    Fixed: the scraped source mangled several tokens (syntax errors), and
    the progress prints referenced the module-level mutable `labels` list —
    `createTree` deletes entries from it, so the printed names drifted out
    of sync (and raised NameError when this module was imported without the
    demo script). The prints now report the feature index only.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Expected entropy after splitting on feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print("I=%s, InfoGain=%s" % (i, infoGain))
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    print('BestFeature index=%s' % bestFeature)
    return bestFeature

def majorityCnt(classList):
    """Return the most frequent class label in *classList* (majority vote).

    Used when the feature set is exhausted but the leaf is still impure.

    Fixed: `dict.iteritems()` is Python 2 only and raises AttributeError on
    Python 3 (which the rest of the file targets via `print(...)` calls);
    replaced with `items()`.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Returns either a class-label string (leaf) or a dict of the form
    ``{featureName: {featureValue: subtree, ...}}``.

    NOTE: this deletes the chosen feature's name from the caller's
    *labels* list (original behavior — pass a copy if you need it intact).

    Fixed: the scraped source had ``myTree = }`` (syntax error) instead of
    ``myTree = {bestFeatLabel: {}}``, plus several mangled tokens.
    """
    classList = [example[-1] for example in dataSet]
    # Base case 1: every record has the same class -> pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: only the class column remains -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # keep labels aligned with the shrunken records
    featValues = set(example[bestFeat] for example in dataSet)
    for value in featValues:
        # Copy so each branch's recursion sees the same remaining labels.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Demo: build the decision tree from the toy-car data and print it.
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
print(myTree)

BestFeature=Metal

I=0,Feature=Brand, InfoGain=1.0

I=1,Feature=Color, InfoGain=0.5

I=2,Feature=Model, InfoGain=0.0

BestFeature=Brand

{'Metal':{'Y': 'yes', 'N': 'no', '?': {'Brand': {'SK': 'yes', 'HW': 'no'}}}}

Patrick Winston MIT AI OpenCourse

Peter Harrington《机器学习实战》

发表于（原文链接）: http://kuaibao.qq.com/s/20180201G01E8A00?refer=cp_1026

2018-07-02

2018-06-01

2019-02-18

2019-02-18

2019-02-18

2019-02-18