# ①Aggregation Model

bagging and decision tree

# ②Random Forest

random forest有三个优点：1. 决策树可以由不同的主机并行生成，效率高；2. 随机森林继承了CART的优点；3. 以bagging的形式结合在一起，避免了过拟合。

subspace

# ③Out-Of-Bag Estimate

1/N就是某条样本一次被抽中的概率，那么一次抽不中的概率是 (1 - 1/N)，抽 N 次都不中就是 (1 - 1/N)^N。当 N → ∞ 时，由 lim (1 + 1/N)^N = e 可得 (1 - 1/N)^N → 1/e ≈ 0.368。所以大约有 36.8% 的数据是抽不到的（即 out-of-bag 数据）。这和之前学过的 validation 有点像，来对比一下：

# ⑤代码实现

### Random Forest in Action

```  def choose_samples(self, data, k):
'''choose the feature from data
input:data, type = list
output:k
'''
n, d = np.shape(data)
feature = []
for j in range(k):
feature.append(rd.randint(0, d - 2))
index = []
for i in range(n):
index.append(rd.randint(0, n-1))
data_samples = []
for i in range(n):
data_tmp = []
for fea in feature:
data_tmp.append(data[i][fea])
data_tmp.append(data[i][-1])
data_samples.append(data_tmp)
pass
return data_samples, feature
pass```

```  def random_forest(self, data, trees_num):
'''create a forest
input:data, type = list
output:trees_result, trees_feature
'''
decisionTree = tree.decision_tree()
trees_result = []
trees_feature = []
d = np.shape(data)[1]
if d > 2:
k = int(math.log(d - 1, 2)) + 1
else:
k = 1

for i in range(trees_num):
print('The ', i, ' tree. ')
data_samples, feature = self.choose_samples(data, k)
t = decisionTree.build_tree(data_samples)
trees_result.append(t)
trees_feature.append(feature)
pass
return trees_result, trees_feature```

```def split_data(data_train, feature):
'''select the feature from data
input:data, feature
output:data, type = list
'''
m = np.shape(data_train)[0]
data = []
for i in range(m):
data_tmp = []
for x in feature:
data_tmp.append(data_train[i][x])
data_tmp.append(data_train[i][-1])
data.append(data_tmp)
return data

'''use the boston dataset from sklearn'''
data = dataSet.data
target = dataSet.target
for i in range(len(target)):
if target[i] == 0:
target[i] = -1
dataframe = pd.DataFrame(data)
dataframe.insert(np.shape(data)[1], 'target', target)
dataMat = np.mat(dataframe)
X_train, X_test, y_train, y_test =  train_test_split(dataMat[:, 0:-1], dataMat[:, -1], test_size=0.3, random_state=0)
data_train = np.hstack((X_train, y_train))
data_train = data_train.tolist()
X_test = X_test.tolist()

return data_train, X_test, y_test```

```  def get_predict(self, trees_result, trees_feature, data_train):
'''predict the result
input:trees_result, trees_feature, data
output:final_prediction
'''
decisionTree = tree.decision_tree()
m_tree = len(trees_result)
m = np.shape(data_train)[0]
result = []
for i in range(m_tree):
clf = trees_result[i]
feature = trees_feature[i]
data = tool.split_data(data_train, feature)
result_i = []
for i in range(m):
result_i.append( list((decisionTree.predict(data[i][0 : -1], clf).keys()))[0] )
result.append(result_i)
final_predict = np.sum(result, axis = 0)
return final_predict

def cal_correct_rate(self, target, final_predict):
m = len(final_predict)
corr = 0.0
for i in range(m):
if target[i] * final_predict[i] > 0:
corr += 1
pass
return corr/m
pass```

```def running():
'''entrance'''
forest = randomForest()
predic = []
for i in range(1, 20):
trees, features = forest.random_forest(data_train, i)
predictions = forest.get_predict(trees, features, text)
accuracy = forest.cal_correct_rate(target, predictions)
print('The forest has ', i, 'tree', 'Accuracy : ' , accuracy)
predic.append(accuracy)

plt.xlabel('Number of tree')
plt.ylabel('Accuracy')
plt.title('The relationship between tree number and accuracy')
plt.plot(range(1, 20), predic, color = 'orange')
plt.show()
pass

if __name__ == '__main__':
running()```

0 条评论

## 相关文章

45360

20590

16730

### 计算机视觉识别简史：从 AlexNet、ResNet 到 Mask RCNN

【新智元导读】 Medium 用户 Đặng Hà Thế Hiển 制作了一张信息图示，用专业、简洁并且最有吸引力的方式——信息图示，讲述计算机视觉（CV）物...

46880

31860

43980

25050

48290

13910

39890