# sklearn 源码分析系列：neighbors(3)

by DemonSong　·　github 源码链接：https://github.com/demonSong/DML

Note:

## 实战

### Nearest Neighbors Classification

# 监督学习

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 15

# Load the data. (Bug fix: `iris` was used below without ever being defined.)
iris = datasets.load_iris()
X = iris.data[:, :2]  # keep only the first two features so results can be plotted
y = iris.target

h = .02  # step size in the mesh

# Colormap for the decision regions
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# Colormap for the training points
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # Build and fit the classifier for this weighting scheme.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary by classifying every point of a dense mesh
    # covering the training data (with a 1-unit margin on each side).
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the predictions into a color plot.
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Overlay the training points.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()

### Nearest Neighbors regression

# 1.6.3 Nearest Neighbors Regression

# generate sample data
# generate sample data
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors

np.random.seed(0)

# 40 sorted training points on [0, 5] and 500 dense query points.
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]
y = np.sin(X).ravel()

# Add noise to every 5th target (indices 0, 5, 10, ... -> 8 values).
y[::5] += 1 * (0.5 - np.random.rand(8))

n_neighbors = 5
# (Fix: the pasted code had lost its indentation; the fit/plot statements
# below must run once per weighting scheme, with plt.show() after the loop.)
for i, weights in enumerate(['uniform', 'distance']):
    # Fit the regressor and predict on the dense query grid.
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
    y_ = knn.fit(X, y).predict(T)

    plt.subplot(2, 1, i + 1)
    plt.scatter(X, y, c='k', label='data')
    plt.plot(T, y_, c='g', label='prediction')
    plt.axis('tight')
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors,
                                                                weights))
plt.show()

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
from sklearn.utils.validation import check_random_state

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

data = fetch_olivetti_faces()
targets = data.target

# Flatten the images: (400, 64, 64) -> (400, 4096)
data = data.images.reshape(len(data.images), -1)

# People 0-29 for training, 30-39 for testing.
train = data[targets < 30]
test = data[targets >= 30]

# Test on 5 faces picked at random from the test set.
n_faces = 5
rng = check_random_state(4)
# randint draws `n_faces` indices uniformly from [0, test.shape[0]).
face_ids = rng.randint(test.shape[0], size=(n_faces,))
test = test[face_ids, :]

n_pixels = data.shape[1]

# Bug fix: np.ceil / np.floor return floats, which NumPy no longer accepts
# as slice indices — convert to int explicitly.
upper_end = int(np.ceil(0.5 * n_pixels))
lower_start = int(np.floor(0.5 * n_pixels))

# Upper half of the faces is the input vector ...
X_train = train[:, :upper_end]
# ... lower half is the (multi-output) regression target.
y_train = train[:, lower_start:]

X_test = test[:, :upper_end]
y_test = test[:, lower_start:]

# Fit estimators: compare four multi-output regressors at completing the
# lower half of a face from its upper half.
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()

for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)

# Plot the completed faces: one row per test face, first column is the true
# face, one column per estimator after it.
image_shape = (64, 64)

n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)

for i in range(n_faces):
    true_face = np.hstack((X_test[i], y_test[i]))

    # Only the first row carries column titles.
    if i:
        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
    else:
        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
                          title="true faces")

    sub.axis("off")
    sub.imshow(true_face.reshape(image_shape),
               cmap=plt.cm.gray,
               interpolation="nearest")

    for j, est in enumerate(sorted(ESTIMATORS)):
        # Stitch the true upper half onto this estimator's predicted lower half.
        completed_face = np.hstack((X_test[i], y_test_predict[est][i]))

        if i:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
        else:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
                              title=est)

        sub.axis("off")
        sub.imshow(completed_face.reshape(image_shape),
                   cmap=plt.cm.gray,
                   interpolation="nearest")

plt.show()

## 源码剖析

Created with Raphaël 2.1.0数据X到结构的映射ClientClientNearestNeighborsNearestNeighborsNeighborsBaseNeighborsBaseUnsupervisedMixinUnsupervisedMixinKDTreeKDTree__init__()_init_params()fit()_fit()__init__()_build()

Created with Raphaël 2.1.0查询结果的返回过程ClientClientNearestNeighborsNearestNeighborsKNeighborsMixinKNeighborsMixinKDTreeKDTreek近邻查询kneighbors(X)query(X)查询结果查询结果

• 监督学习的fit()方法和非监督学习的fit()方法不同，一个为fit(X,y)，而非监督为fit(X)
• 非监督学习没有predict()方法。

### 分类问题

clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)

clf.predict(X)
clf.score(X, y)

KNeighborsClassifier位于Neighbors包下的classification.py文件下。

class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
                           SupervisedIntegerMixin, ClassifierMixin):
    """k-nearest-neighbors classifier.

    (The pasted snippet had lost all indentation, making it invalid Python;
    structure restored here, logic unchanged from the quoted sklearn source.)
    """

    def __init__(self, n_neighbors=5,
                 weights='uniform', algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=1,
                 **kwargs):
        # Parameter bookkeeping is delegated to NeighborsBase.
        self._init_params(n_neighbors=n_neighbors,
                          algorithm=algorithm,
                          leaf_size=leaf_size, metric=metric, p=p,
                          metric_params=metric_params, n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)

    def predict(self, X):
        """Predict the class labels for the provided data X."""
        X = check_array(X, accept_sparse='csr')

        # Distances to, and indices of, the k nearest training samples.
        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            # Normalize single-output targets to the 2-D multi-output layout.
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_outputs = len(classes_)
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            # Majority vote among the neighbors, optionally distance-weighted.
            if weights is None:
                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
            else:
                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

            # Map the winning encoded label back to the original class value.
            mode = np.asarray(mode.ravel(), dtype=np.intp)
            y_pred[:, k] = classes_k.take(mode)

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred

class SupervisedIntegerMixin(object):
    def fit(self, X, y):
        """Fit the model using X as training data and y as integer targets.

        Adapter from the supervised fit(X, y) signature onto
        NeighborsBase._fit(X): it validates and encodes y, stores it on the
        estimator, then delegates index building to _fit.
        """
        if not isinstance(X, (KDTree, BallTree)):
            X, y = check_X_y(X, y, "csr", multi_output=True)

        if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
            if y.ndim != 1:
                warnings.warn("A column-vector y was passed when a 1d array "
                              "was expected. Please change the shape of y to "
                              "(n_samples, ), for example using ravel().",
                              DataConversionWarning, stacklevel=2)

            self.outputs_2d_ = False
            y = y.reshape((-1, 1))
        else:
            self.outputs_2d_ = True

        check_classification_targets(y)
        self.classes_ = []
        # Bug fix: `np.int` was deprecated and removed in NumPy 1.24; the
        # builtin `int` is the documented equivalent.
        self._y = np.empty(y.shape, dtype=int)
        for k in range(self._y.shape[1]):
            # Encode labels of output k as indices into self.classes_[k].
            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes)

        if not self.outputs_2d_:
            self.classes_ = self.classes_[0]
            self._y = self._y.ravel()

        # The actual structure (e.g. KDTree) is built by NeighborsBase._fit.
        return self._fit(X)

KNeighborsClassifier中还会继承一个整个sklearn.base下的ClassifierMixin，它是一个给分类器的预测效果进行评分的接口，代码位于sklearn包下的base.py中，如下：

class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""
    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy of self.predict(X) with respect to y."""
        # Imported lazily to avoid a circular import with sklearn.metrics.
        from .metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

Created with Raphaël 2.1.0参数初始化与数据X拟合ClientClientKNeighborsClassifierKNeighborsClassifierNeighborsBaseNeighborsBaseSupervisedIntegerMixinSupervisedIntegerMixinKDTreeKDTree__init__()_init_params()fit(X，y)_fit(X)__init__()_build()

Created with Raphaël 2.1.0分类预测与打分ClientClientKNeighborsClassifierKNeighborsClassifierKNeighborsMixinKNeighborsMixinKDTreeKDTreeClassifierMixinClassifierMixinpredict(X)k近邻查询query(X)查询结果查询结果预测算法返回预测结果score(X)predict(X)返回预测结果打分算法返回评分结果

### 回归问题

clf = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
clf.fit(X, y)

clf.predict(X)
clf.score(X, y)

KNeighborsRegressor类在Neighbors包下的regression.py中，它同样有四个父类：

• NeighborsBase：参数初始化，fit(X)的调度。
• KNeighborsMixin：k近邻的查询。
• SupervisedFloatMixin：适配器，fit(X,y)到fit(x)的适配。
• RegressorMixin：回归打分机制。

Created with Raphaël 2.1.0参数初始化与数据X拟合ClientClientKNeighborsRegressorKNeighborsRegressorNeighborsBaseNeighborsBaseSupervisedFloatMixinSupervisedFloatMixinKDTreeKDTree__init__()_init_params()fit(X，y)_fit(X)__init__()_build()

Created with Raphaël 2.1.0分类预测与打分ClientClientKNeighborsRegressorKNeighborsRegressorKNeighborsMixinKNeighborsMixinKDTreeKDTreeRegressorMixinRegressorMixinpredict(X)k近邻查询query(X)查询结果查询结果预测算法返回预测结果score(X)predict(X)返回预测结果打分算法返回评分结果

Ok，整个Neighbors系列的架构算是分析完毕了，没什么特别指出的地方，框架有了，没啥内容，后续将着重算法ball_tree和kd_tree性能对比，来点干货，敬请期待。

0 条评论