TVP

# 周末AI课堂 非参模型进阶代码篇：机器学习你会遇到的“坑”

# Generate a two-moons toy dataset and scatter-plot the two classes.
# FIX: the scrape fused two import statements into one line; split them.
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets

# 200 samples with moderate noise; fixed seed for reproducibility.
X, y = datasets.make_moons(200, noise=0.2, random_state=0)

sns.set(style='darkgrid')

# One scatter call per class: (label value, color, legend text).
for i, v, l in [[0, 'r', 'class_0'], [1, 'b', 'class_1']]:
    plt.scatter(X[y == i][:, 0], X[y == i][:, 1], c=v, label=l)

plt.legend()
plt.show()

# Fit an unpruned decision tree on the moons data (X, y from the previous
# snippet) and render the learned tree with graphviz.
# FIX: imports were missing from this snippet in the article; the class
# names exported were ['class_0', 'class_2'], inconsistent with the
# class_0/class_1 labels used in the scatter plot.
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree
import graphviz

clf = DTC(criterion='entropy')
clf.fit(X, y)

# Export the fitted tree to DOT source for rendering; feature and class
# names must match how the data was generated and plotted.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=['F_1', 'F_2'],
                                class_names=['class_0', 'class_1'],
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
graph.render('MOON')  # writes the rendered tree to MOON.pdf

.....

• 树的深度。随着划分次数的增加，空间被越分越小，预测值也会越来越精确，另一方面，递归的次数也越多，树也越深。

• 叶节点的数目。叶节点就是决策树最末端的节点，它和树的深度一起表示了树的复杂程度。

• 叶节点包含的样本个数。如果它包含的样本太少，说明决策树为少量的样本创建了规则，强行使某一个样本被划分正确，很有可能就出现了过拟合。

# Cross-validate decision trees over a range of min_samples_leaf values
# and plot train vs. test accuracy to visualize over-/under-fitting.
# FIXES: the scrape fused two import lines; the plot calls referenced an
# undefined name `depths` (the loop iterates a different variable), which
# raises NameError; modern scikit-learn needs return_train_score=True for
# the 'train_score' key; the lists hold accuracy, not MSE, so rename.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_validate
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree

X, y = datasets.make_moons(1000, noise=0.2, random_state=0)

test_acc = []
train_acc = []
leaf_sizes = range(1, 20)

for d in leaf_sizes:
    clf = DTC(criterion='entropy', min_samples_leaf=d)
    # return_train_score=True: 'train_score' is not returned by default
    # since scikit-learn 0.21.
    scores = cross_validate(clf, X, y, cv=10, scoring='accuracy',
                            return_train_score=True)
    test_acc.append(scores['test_score'].mean())
    train_acc.append(scores['train_score'].mean())

sns.set(style='darkgrid')

# Plot against the leaf-size range actually iterated above.
plt.plot(leaf_sizes, train_acc, 'b-', label='Train Accuracy')
plt.plot(leaf_sizes, test_acc, 'r-', label='Test Accuracy')
plt.xlabel('minimum number of samples required to be at a leaf node')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Fit regression trees of increasing max_depth to a noisy 1-D signal and
# plot the fitted step functions against the data, illustrating how depth
# controls under- vs. over-fitting.
# FIX: the scrape fused two import statements into one line; split them.
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

rng = np.random.RandomState(1)
# 100 evenly spaced points in [0, 6]; column vector as sklearn expects.
X = np.linspace(0, 6, 100)[:, np.newaxis]
# Target: sum of two sinusoids plus Gaussian noise (sigma = 0.1).
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_3 = DecisionTreeRegressor(max_depth=10)
regr_1.fit(X, y)
regr_2.fit(X, y)
regr_3.fit(X, y)

# Predict on the training grid to visualize each fitted curve.
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
y_3 = regr_3.predict(X)

plt.figure()
plt.scatter(X, y, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(X, y_1, color="cornflowerblue",
         label="max_depth=2", linewidth=2)
plt.plot(X, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.plot(X, y_3, color='r', label='max_depth=10', linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

# Cross-validate regression trees over max_depth and plot train/test MSE.
# FIXES: the scrape fused two import lines; criterion='mse' was removed in
# scikit-learn 1.0 (renamed 'squared_error'); modern scikit-learn needs
# return_train_score=True for the 'train_score' key.
import numpy as np
from sklearn.tree import DecisionTreeRegressor as DTR
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import cross_validate
import seaborn as sns

rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

test_mse = []
train_mse = []
depths = range(2, 5)

for d in depths:
    # 'squared_error' is the current name of the old 'mse' criterion.
    clf = DTR(criterion='squared_error', max_depth=d)
    scores = cross_validate(clf, X, y, cv=10,
                            scoring='neg_mean_squared_error',
                            return_train_score=True)
    # The scorer returns *negated* MSE; take the absolute value of the
    # fold mean to report a positive MSE.
    test_mse.append(np.abs(scores['test_score'].mean()))
    train_mse.append(np.abs(scores['train_score'].mean()))

sns.set(style='darkgrid')

plt.plot(depths, train_mse, 'b-', label='Train MSE')
plt.plot(depths, test_mse, 'r-', label='Test MSE')
plt.xlabel('Max Depth')
plt.ylabel('MSE')
plt.legend()
plt.show()

……clf = DTC(criterion='entropy', min_samples_leaf=8)……

• 回归树中的度量选取MSE，也可以选取很多回归任务中的性能指标，比如MAE，而不用担心有些性能指标难以优化的问题，因为在回归树中，所定义的优化函数的目的只是用来比较，进一步来确定划分点。

• 树的深度、叶节点的数目、叶节点包含的样本个数对应着剪枝过程，但这三者并非是相互独立的，往往限制了其中一个，另一个也会被限制，所以在实际使用过程中只需要限制其中一个。

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20180819A1DO7300?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

2023-10-04

Get大咖技术交流圈