# 聚类算法

### 1.欧氏距离：

p=2时就是平时计算的几何距离；当p趋向于正无穷的时候，其实求的就不是x、y的距离了，而是求x、y中最长的一个了。因为如果x大于y，在指数增长下x会远大于y，所以y会被忽略掉。这也是比较常用的了。

image

### 4.Pearson相似度：

image

pearson相似度和cos相似度其实很像，其实就是平移到原点求余弦

### 5.相对熵：

image

==================================================================================================================

Kmeans均值算法：

1.选定k个类别中心点，u1,u2......uk

2.对于每个样本点，选定离他最近的那个类别作为一个类别。

3.将类别中心更新为所有样本点的均值

4.重复步骤，直到均值点不再更新为止。

### 首先是数据的准备：

```import numpy as np

import matplotlib.pyplot as plt

import seaborn as sea

import pandas as pd

import random

df.drop('编号' , axis=1 , inplace=True)

return df```

1,0.697,0.46

2,0.774,0.376

3,0.634,0.264

4,0.608,0.318

5,0.556,0.215

6,0.403,0.237

7,0.481,0.149

8,0.437,0.211

9,0.666,0.091

10,0.243,0.267

11,0.245,0.057

12,0.343,0.099

13,0.639,0.161

14,0.657,0.198

15,0.36,0.37

16,0.593,0.042

17,0.719,0.103

18,0.359,0.188

19,0.339,0.241

20,0.282,0.257

21,0.784,0.232

22,0.714,0.346

23,0.483,0.312

24,0.478,0.437

25,0.525,0.369

26,0.751,0.489

27,0.532,0.472

28,0.473,0.376

29,0.725,0.445

30,0.446,0.459

```dataFile = read_file('Data.txt')

data = dataFile.values

mean_data = np.mean(data , axis=0)

data -= mean_data

data = list(data)

k = 3

mean_vectors = random.sample(data , k)

'''

'''

def dist(p1 , p2):

return np.sqrt(sum(np.square(p1 - p2)))```

### 聚类过程：

```while True:

clusters = list(map((lambda x : [x]) , mean_vectors))

for sample in data:

distances = list(map((lambda m : dist(sample , m)) , mean_vectors))

min_indexs = distances.index(min(distances))

clusters[min_indexs].append(sample)

new_mean_vectors = []

for c , v in zip(clusters , mean_vectors):

new_mean_vector = sum(c)/len(c)

if all(np.divide((new_mean_vector-v),v) < np.array([0.0001,0.0001]) ):

new_mean_vectors.append(v)

else:

new_mean_vectors.append(new_mean_vector)

if np.array_equal(mean_vectors,new_mean_vectors):

break

else:

mean_vectors = new_mean_vectors

print(mean_vectors)

print(new_mean_vectors)

pass```

`clusters = list(map((lambda x : [x]) , mean_vectors))`

```  for sample in data:

distances = list(map((lambda m : dist(sample , m)) , mean_vectors))

min_indexs = distances.index(min(distances))

clusters[min_indexs].append(sample)

new_mean_vectors = []```

```  for c , v in zip(clusters , mean_vectors):

new_mean_vector = sum(c)/len(c)

if all(np.divide((new_mean_vector-v),v) < np.array([0.0001,0.0001]) ):

new_mean_vectors.append(v)

else:

new_mean_vectors.append(new_mean_vector)

if np.array_equal(mean_vectors,new_mean_vectors):

break

else:

mean_vectors = new_mean_vectors

print(mean_vectors)```

```total_colors = ['r','y','g','b','c','m','k']

shapes = ['p','o','v','^','s']

colors = random.sample(total_colors,k)

index = 0

x = list(map((lambda x : x[0]) , mean_vectors))

y = list(map((lambda x : x[1]) , mean_vectors))

for cluster,color in zip(clusters,colors):

density = list(map(lambda arr:arr[0],cluster))

sugar_content = list(map(lambda arr:arr[1],cluster))

plt.scatter(density,sugar_content,c = color,marker=shapes[index])

index += 1

plt.scatter(x , y , c = 'blue' , marker='x')

plt.show()```

image

image

Kmeans的公式化解释：

image

Kmeans方法总结：

### 层次聚类方法：

python实现：

```import numpy as np

import sklearn.datasets as datasets

import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

import math

from sklearn.decomposition import PCA

def get_data():

data = df.data

label = df.target

data_matrix = np.matrix(data)

label_matrix = np.matrix(label)

return data , label

def ou_distance(vec1, vec2):
    """Euclidean distance between two equal-length vectors."""
    squared = 0.0
    for a, b in zip(vec1, vec2):
        squared += (a - b) ** 2
    return math.sqrt(squared)

pass```

```class Node(object):

def __init__(self ,

vec=None ,

left = None ,

right = None ,

distance = -1 ,

id = None ,

count = 1):

self.vec = vec

self.left = left

self.right = right

self.distance = distance

self.id = id

self.count = count

pass```

```class Hierarchical(object):

def __init__(self , k = 1):

assert k > 0

self.k = k

self.labels = None

self.nodesList = []

pass

def fit(self , x):

#一开始一个点就是一个类别

nodes = [Node(vec=v , id=i) for i ,v in enumerate(x)]

self.nodesList.append(nodes.copy())

distance = {}

point_num , features_num = np.shape(x)

self.labels = [-1] * point_num

currentClusterid = -1

while len(nodes) > self.k:

if len(nodes) == 30:

self.nodesList.append(nodes.copy())

elif len(nodes) == 20:

self.nodesList.append(nodes.copy())

elif len(nodes) == 15:

self.nodesList.append(nodes.copy())

elif len(nodes) == 10:

self.nodesList.append(nodes.copy())

elif len(nodes) == 5:

self.nodesList.append(nodes.copy())

min_distance = math.inf

nodes_lenght = len(nodes)

closet_part = None

for i in range(nodes_lenght - 1):

for j in range(i + 1 , nodes_lenght):

d_key = (nodes[i].id , nodes[j].id)

if d_key not in distance:

distance[d_key] = ou_distance(nodes[i].vec , nodes[j].vec)

d = distance[d_key]

if d < min_distance:

min_distance = d

closet_part = (i , j)

part1 , part2 = closet_part

node1 , node2 = nodes[part1] , nodes[part2]

new_vec = [(node1.vec[i]*node1.count + node2.vec[i]*node2.count)/

(node1.count + node2.count) for i in range(features_num)]

new_node = Node(vec=new_vec ,

left=node1,

right=node2,

distance=min_distance,

id=currentClusterid,

count=node1.count + node2.count)

currentClusterid -= 1

del nodes[part2], nodes[part1]

nodes.append(new_node)

self.nodes = nodes

self.nodesList.append(self.nodes.copy())

self.calc_label()

pass

nodes = [Node(vec=v , id=i) for i ,v in enumerate(x)]

self.nodesList.append(nodes.copy())

distance = {}

point_num , features_num = np.shape(x)

self.labels = [-1] * point_num

currentClusterid = -1```

```      while len(nodes) > self.k:

if len(nodes) == 30:

self.nodesList.append(nodes.copy())

elif len(nodes) == 20:

self.nodesList.append(nodes.copy())

elif len(nodes) == 15:

self.nodesList.append(nodes.copy())

elif len(nodes) == 10:

self.nodesList.append(nodes.copy())

elif len(nodes) == 5:

self.nodesList.append(nodes.copy())

while()里面这段都是画图准备的。不用管。

min_distance = math.inf

nodes_lenght = len(nodes)

closet_part = None

for i in range(nodes_lenght - 1):

for j in range(i + 1 , nodes_lenght):

d_key = (nodes[i].id , nodes[j].id)

if d_key not in distance:

distance[d_key] = ou_distance(nodes[i].vec , nodes[j].vec)

d = distance[d_key]

if d < min_distance:

min_distance = d

closet_part = (i , j)```

```          part1 , part2 = closet_part

node1 , node2 = nodes[part1] , nodes[part2]

new_vec = [(node1.vec[i]*node1.count + node2.vec[i]*node2.count)/

(node1.count + node2.count) for i in range(features_num)]

new_node = Node(vec=new_vec ,

left=node1,

right=node2,

distance=min_distance,

id=currentClusterid,

count=node1.count + node2.count)

currentClusterid -= 1

print(currentClusterid)

del nodes[part2], nodes[part1]

nodes.append(new_node)

self.nodes = nodes

self.nodesList.append(self.nodes.copy())

self.calc_label()

pass```

```  def calc_label(self):

for i, node in enumerate(self.nodes):

self.leaf_traversal(node, i)

def leaf_traversal(self, node: Node, label):

if node.left == None and node.right == None:

self.labels[node.id] = label

if node.left:

self.leaf_traversal(node.left, label)

if node.right:

self.leaf_traversal(node.right, label)```

```pca = PCA(n_components=2)

pca.fit(data)

PCA(copy=True, n_components=2, whiten=False)

data = pca.transform(data)

def leaf_print(node , j):

if node.left == None or node.right == None:

plt.scatter(data[node.id][0] , data[node.id][1] , color = colors[j] , marker=markers[j])

if node.left:

leaf_print(node.left , j)

if node.right:

leaf_print(node.right , j)

def draw(nodes):

for i , j in zip(nodes , range(len(nodes))):

leaf_print(i,(j+1)%len(markers))

plt.title(len(nodes))

plt.show()```

```colors = {

'burlywood':            '#DEB887',

'chartreuse':          '#7FFF00',

'chocolate':            '#D2691E',

'cornflowerblue':      '#6495ED',

'cornsilk':            '#FFF8DC',

'crimson':              '#DC143C',

'cyan':                '#00FFFF',

'darkblue':            '#00008B',

'darkcyan':            '#008B8B',

'darkgoldenrod':        '#B8860B',

'darkgray':            '#A9A9A9',

'darkgreen':            '#006400',

'darkkhaki':            '#BDB76B',

'darkmagenta':          '#8B008B',

'darkolivegreen':      '#556B2F',

'darkorange':          '#FF8C00',

'darkorchid':          '#9932CC',

'darkred':              '#8B0000',

'darksalmon':          '#E9967A',

'darkseagreen':        '#8FBC8F',

'darkslateblue':        '#483D8B',

'darkslategray':        '#2F4F4F',

'darkturquoise':        '#00CED1',

'darkviolet':          '#9400D3',

'deeppink':            '#FF1493',

'deepskyblue':          '#00BFFF',

'dimgray':              '#696969',

'dodgerblue':          '#1E90FF',

'firebrick':            '#B22222',

'floralwhite':          '#FFFAF0',

'forestgreen':          '#228B22',

'fuchsia':              '#FF00FF',

'gainsboro':            '#DCDCDC',

'ghostwhite':          '#F8F8FF',

'gold':                '#FFD700',

'goldenrod':            '#DAA520',

'gray':                '#808080',

'green':                '#008000',

'honeydew':            '#F0FFF0',

'hotpink':              '#FF69B4',

'indianred':            '#CD5C5C',

'indigo':              '#4B0082',

'ivory':                '#FFFFF0',

'khaki':                '#F0E68C',

'lavender':            '#E6E6FA',

'lavenderblush':        '#FFF0F5',

'lawngreen':            '#7CFC00',

'lemonchiffon':        '#FFFACD',

'mediumpurple':        '#9370DB',

'mediumseagreen':      '#3CB371',

'mediumslateblue':      '#7B68EE',

'mediumspringgreen':    '#00FA9A',

'mediumturquoise':      '#48D1CC',

'mediumvioletred':      '#C71585',

'midnightblue':        '#191970',

'mintcream':            '#F5FFFA',

'mistyrose':            '#FFE4E1',

'moccasin':            '#FFE4B5',

'navy':                '#000080',

'oldlace':              '#FDF5E6',

'olive':                '#808000',

'olivedrab':            '#6B8E23',

'orange':              '#FFA500',

'orangered':            '#FF4500',

'orchid':              '#DA70D6',

'palegoldenrod':        '#EEE8AA',

'palegreen':            '#98FB98',

'paleturquoise':        '#AFEEEE',

'palevioletred':        '#DB7093',

'papayawhip':          '#FFEFD5',

'peachpuff':            '#FFDAB9',

'peru':                '#CD853F',

'pink':                '#FFC0CB',

'plum':                '#DDA0DD',

'powderblue':          '#B0E0E6',

'purple':              '#800080',

'red':                  '#FF0000',

'rosybrown':            '#BC8F8F',

'royalblue':            '#4169E1',

'salmon':              '#FA8072',

'sandybrown':          '#FAA460',

'seagreen':            '#2E8B57',

'seashell':            '#FFF5EE',

'sienna':              '#A0522D',

'silver':              '#C0C0C0',

'skyblue':              '#87CEEB',

'slateblue':            '#6A5ACD',

'slategray':            '#708090',

'snow':                '#FFFAFA',

'springgreen':          '#00FF7F',

'steelblue':            '#4682B4',

'tan':                  '#D2B48C',

}

colors = list(colors.values())```

`markers = ['.' , ',' , 'o' , 'v' , '^', '<' , '>' , '1','2','3','4','s' , 'p' , '*' , 'h' , 'H' , 'd' , '|'  , '+' , 'x']`

```my = Hierarchical(4)

data , label = get_data()

my.fit(data)```

```for nodes in my.nodesList:

draw(nodes)

pass```

image

image

image

image

image

image

image

image

==================================================================================================================================

### DBSCAN密度聚类算法：

image

image

python实现：

```import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

import sklearn.datasets as datasets

import math

def get_data():

dataset = []

for d in data:

d1 = (d[0] , d[1] , d[2] , d[3])

dataset.append(d1)

return dataset```

```#计算欧氏距离

def dist(vec1, vec2):
    """Euclidean distance between two equal-length vectors."""
    total = 0.0
    for a, b in zip(vec1, vec2):
        total += (a - b) ** 2
    return math.sqrt(total)

pass```

```def DBSCAN(D, e, Minpts):

#初始化核心对象集合T,聚类个数k,聚类集合C, 未访问集合P,

T = set()

k = 0

C = []

P = set(D)

for d in D:

if len([ i for i in D if dist(d, i) <= e]) >= Minpts:

#开始聚类

while len(T):

P_old = P

#随机选择一个核心对象

o = list(T)[np.random.randint(0, len(T))]

Q = []

#加入集合

Q.append(o)

while len(Q):

q = Q[0]

Nq = [i for i in D if dist(q, i) <= e]

if len(Nq) >= Minpts:

S = P & set(Nq)

Q += (list(S))

P = P - S

Q.remove(q)

k += 1

Ck = list(P_old - P)

T = T - set(Ck)

C.append(Ck)

return C , P```

```  for d in D:

if len([ i for i in D if dist(d, i) <= e]) >= Minpts:

```  while len(T):

P_old = P

#随机选择一个核心对象

o = list(T)[np.random.randint(0, len(T))]

Q = []

#加入集合 Q集合是待选择的密度可达核心点

Q.append(o)

while len(Q):

q = Q[0]

Nq = [i for i in D if dist(q, i) <= e]要计算里面的点是不是核心点，因为里面存的不仅仅是核心点，而是核心点周围的点

if len(Nq) >= Minpts:是核心点

S = P & set(Nq)p是没有被选择的点 set(Nq是核心对象周围的点) 那么S就是类别了。

Q += (list(S))重复上述步骤，看看周围的点哪些是核心对象点，把它周围的点拉进来，再不断扩张

P = P - S选出去了，可以不要了

Q.remove(q)用了  不要了

k += 1

Ck = list(P_old - P)剩下的就是被选取的

T = T - set(Ck)这些核心点已经被用过了，可以丢弃，要不永远都不会结束的

C.append(Ck)分出来的一个类别```

`C , P= DBSCAN(data, 0.6, 4)`

C是分类，P是噪音点，没有被选择的

```from sklearn.decomposition import PCA

pca = PCA(n_components=2)

pca.fit(data)

data2 = pca.transform(data)

def Draw(C):

for c , i in zip(C , range(len(C))):

for s in c:

s = data.index(s)

s = data2[s]

plt.scatter(s[0] , s[1] , color = colors[i])

pass

for i in P:

i = data.index(i)

i = data2[i]

plt.scatter(i[0] , i[1] , color = 'black')

pass

plt.show()

pass```

`Draw(C)`

image

image

==================================================================================================================================

### 谱聚类

image

image

image

image

image

image

python实现：

```import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

import sklearn.datasets as dataset

import seaborn as sns

import math

def pca(data):
    """Project ``data`` onto its first two principal components (sklearn PCA)."""
    model = PCA(n_components=2)
    model.fit(data)
    return model.transform(data)

def get_data():

# NOTE(review): as scraped, the local name ``data`` is read before it is
# assigned (UnboundLocalError at runtime). The original article presumably
# loaded a dataset (e.g. iris) into ``data`` on a line lost in extraction —
# confirm against the source before running.

data = pca(data)

return data

data = get_data()

def dist(vec1 , vec2):

distance = 0.0

for a, b in zip(vec1, vec2):

distance += math.pow(a - b, 2)

return math.sqrt(distance)```

```def getWbyKNN(data , K):

point_num = len(data)

dis_num = np.zeros((point_num , point_num))

W = np.zeros((point_num , point_num))

for i in range(point_num):

for j in range(i+1 , point_num):

dis_num[i][j] = dis_num[j][i] = dist(data[i] , data[j])

for idx , each in enumerate(dis_num):

index_distance = np.argsort(each)

W[idx][index_distance[1 : K+1]] = 1

tmp_W = np.transpose(W)

W = (tmp_W + W)/2

return W

pass```

```#得到度矩阵

def getD(W):
    """Return the degree matrix of adjacency matrix W: row sums on the diagonal."""
    degrees = [sum(row) for row in W]
    D = np.diag(np.zeros(len(W)))
    for i, deg in enumerate(degrees):
        D[i][i] = deg
    return D

pass```

```#得到游走拉普拉斯矩阵 I - D-1W

def getLPLSmatrix(W, D):
    """Return the random-walk Laplacian L = I - D^{-1} W.

    W: adjacency matrix; D: its degree matrix (must be invertible).
    """
    D_inv = np.linalg.inv(D)
    # BUG FIX: np.eye(len(W[0]), 1) built an N x 1 column vector (a single 1
    # in the top-left) that broadcast over D^{-1}W, so the result was not the
    # Laplacian at all. The identity must be the full N x N matrix.
    I = np.eye(len(W))
    return I - D_inv.dot(W)

pass```

```W = getWbyKNN(data , 5)

D = getD(W)

L = getLPLSmatrix(W , D)```

```def getEigVec(L,cluster_num):  #从拉普拉斯矩阵获得特征矩阵

eigval,eigvec = np.linalg.eig(L)

dim = len(eigval)

dictEigval = dict(zip(eigval,range(0,dim)))

kEig = np.sort(eigval)[0:cluster_num]

ix = [dictEigval[k] for k in kEig]

return eigval[ix],eigvec[:,ix]

eigval , eigvec = getEigVec(L , 5)```

```clk = KMeans(4)

clk.fit(eigvec)

C = clk.labels_```

image

```colors = ['r' , 'b' , 'g' , 'y']

markers = ['x' , '^' , '+' , 's']

def draw(data , label):

for i , j in zip(data , label):

plt.scatter(i[0] , i[1] , color = colors[j] , marker=markers[j])

plt.show()

pass```

image

0 条评论

## 相关文章

1944

4635

3.7K1

1413

4666

### Python：SMOTE算法

17.11.28更新一下：最近把这个算法集成到了数据预处理的python工程代码中了，不想看原理想直接用的，有简易版的python开发：特征工程代码模版 ，进...

2224

### 教你从零开始在 TensorFlow 上搭建 RNN（完整代码）！

RNN 是什么? 递归神经网络，或者说 RNN，在数据能被按次序处理、数据点的不同排列亦会产生影响时就可以使用它。更重要的是，该次序可以是任意长度。 最直接...

3726

1923

1643

### 【Unity3d游戏开发】游戏中的贝塞尔曲线以及其在Unity中的实现

RT，马三最近在参与一款足球游戏的开发，其中涉及到足球的各种运动轨迹和路径，比如射门的轨迹，高吊球，香蕉球的轨迹。最早的版本中马三是使用物理引擎加力的方式实...

5091