# 使用scikit-learn解释随机森林算法

### 用treeinterpreter分解随机森林预测

[py] view plaincopy

1. from treeinterpreter import treeinterpreter as ti
2. from sklearn.tree import DecisionTreeRegressor
3. from sklearn.ensemble import RandomForestRegressor
4. import numpy as np
5. from sklearn.datasets import load_boston
6. boston = load_boston()
7. rf = RandomForestRegressor()
8. rf.fit(boston.data[:300], boston.target[:300])

[py] view plaincopy

1. instances = boston.data[[300, 309]]
2. print "Instance 0 prediction:", rf.predict(instances[0])
3. print "Instance 1 prediction:", rf.predict(instances[1])

Instance 0 prediction: [ 30.76] Instance 1 prediction: [ 22.41]

[py] view plaincopy

1. prediction, biases, contributions = ti.predict(rf, instances)

[py] view plaincopy

1. for i in range(len(instances)):
2. print "Instance", i
3. print "Bias (trainset mean)", biases[i]
4. print "Feature contributions:"
5. for c, feature in sorted(zip(contributions[i],
6. boston.feature_names),
7. key=lambda x: -abs(x[0])):
8. print feature, round(c, 2)
9. print "-"*20

Instance 0 Bias (trainset mean) 25.2849333333 Feature contributions: RM 2.73 LSTAT 1.71 PTRATIO 1.27 ZN 1.04 DIS -0.7 B -0.39 TAX -0.19 CRIM -0.13 RAD 0.11 INDUS 0.06 AGE -0.02 NOX -0.01 CHAS 0.0

--------------------

Instance 1 Bias (trainset mean) 25.2849333333 Feature contributions: RM -4.88 LSTAT 2.38 DIS 0.32 AGE -0.28 TAX -0.23 CRIM 0.16 PTRATIO 0.15 B -0.15 INDUS -0.14 CHAS -0.1 ZN -0.05 NOX -0.05 RAD -0.02

[py] view plaincopy

1. print prediction
2. print biases + np.sum(contributions, axis=1)

[ 30.76 22.41] [ 30.76 22.41]

### 比较两个数据集

• 理解造成两个数据集预测值差异的真正原因，比如是什么因素导致相邻两幢房屋的预测价值差异。
• 调试模型和数据，例如解释为什么新数据的平均预测值和旧数据的不一样。

[py] view plaincopy

1. ds1 = boston.data[300:400]
2. ds2 = boston.data[400:]
3. print np.mean(rf.predict(ds1))
4. print np.mean(rf.predict(ds2))

22.1912 18.4773584906

[py] view plaincopy

1. prediction1, bias1, contributions1 = ti.predict(rf, ds1)
2. prediction2, bias2, contributions2 = ti.predict(rf, ds2)

[py] view plaincopy

1. totalc1 = np.mean(contributions1, axis=0)
2. totalc2 = np.mean(contributions2, axis=0)

[py] view plaincopy

1. print np.sum(totalc1 - totalc2)
2. print np.mean(prediction1) - np.mean(prediction2)

3.71384150943 3.71384150943

[py] view plaincopy

1. for c, feature in sorted(zip(totalc1 - totalc2,
2. boston.feature_names), reverse=True):
3. print feature, round(c, 2)

LSTAT 2.8 CRIM 0.5 RM 0.5 PTRATIO 0.09 AGE 0.08 NOX 0.03 B 0.01 CHAS -0.01 ZN -0.02 RAD -0.03 INDUS -0.03 TAX -0.08 DIS -0.14

### 分类树和森林

[py] view plaincopy

1. from sklearn.ensemble import RandomForestClassifier
2. from sklearn.datasets import load_iris
3. iris = load_iris()
4. rf = RandomForestClassifier(max_depth = 4)
5. idx = range(len(iris.target))
6. np.random.shuffle(idx)
7. rf.fit(iris.data[idx][:100], iris.target[idx][:100])

[py] view plaincopy

1. instance = iris.data[idx][100:101]
2. print rf.predict_proba(instance)

[py] view plaincopy

1. prediction, bias, contributions = ti.predict(rf, instance)
2. print "Prediction", prediction
3. print "Bias (trainset prior)", bias
4. print "Feature contributions:"
5. for c, feature in zip(contributions[0],
6. iris.feature_names):
7. print feature, c

Prediction [[ 0. 0.9 0.1]] Bias (trainset prior) [[ 0.36 0.262 0.378]] Feature contributions: sepal length (cm) [-0.1228614 0.07971035 0.04315104] sepal width (cm) [ 0. -0.01352012 0.01352012] petal length (cm) [-0.11716058 0.24709886 -0.12993828] petal width (cm) [-0.11997802 0.32471091 -0.20473289]

821 篇文章187 人订阅

0 条评论

## 相关文章

873

### 深度学习笔记-神经网络介绍

---- Deep Learning 学习笔记-第一周 Andrew Ng发布了新课程，业界评价很好。在看的过程中非常不错，我把一些重要的知识和要点进行了总结和...

3254

1001

3083

2198

1162

1967

702

2212

3605