# Python机器学习的练习七：K-Means聚类和主成分分析

## K-Means聚类

```import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

def find_closest_centroids(X, centroids):
m= X.shape[0]
k= centroids.shape[0]
idx= np.zeros(m)

for iin range(m):
min_dist= 1000000
for jin range(k):
dist= np.sum((X[i,:]- centroids[j,:])** 2)
if dist < min_dist:
min_dist= dist
idx[i]= j

return idx```

```data= loadmat('data/ex7data2.mat')
X= data['X']
initial_centroids= initial_centroids= np.array([[3,3], [6,2], [8,5]])

idx= find_closest_centroids(X, initial_centroids)
idx[0:3]
array([0., 2., 1.])```

```def compute_centroids(X, idx, k):
m, n= X.shape
centroids= np.zeros((k, n))

for iin range(k):
indices= np.where(idx== i)
centroids[i,:]= (np.sum(X[indices,:], axis=1)/ len(indices[0])).ravel()

return centroids

compute_centroids(X, idx,3)
array([[2.42830111, 3.15792418],
[5.81350331, 2.63365645],
[7.11938687, 3.6166844 ]])```

```def run_k_means(X, initial_centroids, max_iters):
m, n= X.shape
k= initial_centroids.shape[0]
idx= np.zeros(m)
centroids= initial_centroids

for iin range(max_iters):
idx= find_closest_centroids(X, centroids)
centroids= compute_centroids(X, idx, k)

return idx, centroids

idx, centroids= run_k_means(X, initial_centroids,10)```

```cluster1= X[np.where(idx== 0)[0],:]
cluster2= X[np.where(idx== 1)[0],:]
cluster3= X[np.where(idx== 2)[0],:]

fig, ax= plt.subplots(figsize=(12,8))
ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1')
ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2')
ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3')
ax.legend()```

```def init_centroids(X, k):
m, n= X.shape
centroids= np.zeros((k, n))
idx= np.random.randint(0, m, k)

for iin range(k):
centroids[i,:]= X[idx[i],:]

return centroids

init_centroids(X,3)
array([[1.15354031, 4.67866717],
[6.27376271, 2.24256036],
[2.20960296, 4.91469264]])```

```image_data= loadmat('data/bird_small.mat')
image_data
{'A': array([[[219,180,103],
[230,185,116],
[226,186,110],
...,
[14, 15, 13],
[13, 15, 12],
[12, 14, 12]],
...,
[[15, 19, 19],
[20, 20, 18],
[18, 19, 17],
...,
[65, 43, 39],
[58, 37, 38],
[52, 39, 34]]], dtype=uint8),
'__globals__': [],
'__header__':'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Jun  5 04:06:24 2012',
'__version__':'1.0'}```

`A= image_data['A'] `
`A.shape`
`(128L,128L,3L)`

```# normalize value ranges
A= A/ 255.

# reshape the array
X= np.reshape(A, (A.shape[0]* A.shape[1], A.shape[2]))

# randomly initialize the centroids
initial_centroids= init_centroids(X,16)

# run the algorithm
idx, centroids= run_k_means(X, initial_centroids,10)

# get the closest centroids one last time
idx= find_closest_centroids(X, centroids)

# map each pixel to the centroid value
X_recovered= centroids[idx.astype(int),:]

# reshape to the original dimensions
X_recovered= np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))

plt.imshow(X_recovered)```

## 主成分分析

PCA是一种线性变换，可以在数据集中找到“主成分”，即方差最大的方向。它最常见的用途之一是对数据进行降维。在这个练习中，我们需要实现PCA，并将其应用于一个简单的二维数据集，观察它是如何工作的。我们从加载并可视化数据集开始。

```data= loadmat('data/ex7data1.mat')
X= data['X']

fig, ax= plt.subplots(figsize=(12,8))
ax.scatter(X[:,0], X[:,1])```

PCA的算法相当简单。在对数据进行归一化之后，只需对原始数据的协方差矩阵做奇异值分解（SVD）即可。由于numpy已经内置了计算协方差矩阵和SVD的函数，我们将直接利用这些函数，而不是从头实现。

```def pca(X):
# normalize the features
X= (X- X.mean())/ X.std()

# compute the covariance matrix
X= np.matrix(X)
cov= (X.T* X)/ X.shape[0]

# perform SVD
U, S, V= np.linalg.svd(cov)

return U, S, V

U, S, V= pca(X)
U, S, V
(matrix([[-0.79241747,-0.60997914],
[-0.60997914, 0.79241747]]),
array([1.43584536, 0.56415464]),
matrix([[-0.79241747,-0.60997914],
[-0.60997914, 0.79241747]]))```

```def project_data(X, U, k):
U_reduced= U[:,:k]
return np.dot(X, U_reduced)

Z= project_data(X, U,1)
Z
matrix([[-4.74689738],
[-7.15889408],
[-4.79563345],
[-4.45754509],
[-4.80263579],
...,
[-6.44590096],
[-2.69118076],
[-4.61386195],
[-5.88236227],
[-7.76732508]])```

```def recover_data(Z, U, k):
U_reduced= U[:,:k]
return np.dot(Z, U_reduced.T)

X_recovered= recover_data(Z, U,1)
X_recovered
matrix([[3.76152442, 2.89550838],
[5.67283275, 4.36677606],
[3.80014373, 2.92523637],
[3.53223661, 2.71900952],
[3.80569251, 2.92950765],
...,
[5.10784454, 3.93186513],
[2.13253865, 1.64156413],
[3.65610482, 2.81435955],
[4.66128664, 3.58811828],
[6.1549641 , 4.73790627]])```

```fig, ax= plt.subplots(figsize=(12,8))
ax.scatter(X_recovered[:,0], X_recovered[:,1])```

```faces= loadmat('data/ex7faces.mat')
X= faces['X']
X.shape
(5000L,1024L)```

```face= np.reshape(X[3,:], (32,32))
plt.imshow(face)```

```U, S, V= pca(X)
Z= project_data(X, U,100)```

```X_recovered= recover_data(Z, U,100)
face= np.reshape(X_recovered[3,:], (32,32))
plt.imshow(face)```

http://www.johnwittenauer.net/machine-learning-exercises-in-python-part-7/

1990 篇文章97 人订阅

0 条评论

## 相关文章

372100

28860

31680

21290

24690

20560

### 【BDTC 2017讲师专访】彭冬：微博商业基础大数据平台（D+）的架构演进

BDTC 2017中国大数据技术大会将于12月7日-9日在北京新云南皇冠假日酒店举行，大会为期三天。届时，近百位技术专家将为现场数千名的大数据行业精英、技术...

28250

44080

27790