写在前面
聚类的学习过程总是很快乐的,因为真的太简单了!
初始设定两个参数:minPts(邻域内的最少点数)以及邻域半径 r(代码中对应变量 epsilon)。
import matplotlib.pyplot as plt
minPts = 5  # minimum neighbour count for a point to be treated as a core point
epsilon = 1.0  # neighbourhood radius
color = ['red', 'black', 'blue', 'orange']  # colours used when plotting clusters
visited = []  # visited[i] becomes True once point i has been processed
C = []  # final clustering result
noise = []  # noise points
x = []
y = []
# Each non-blank line holds tab-separated fields; the first two are read as
# the x and y coordinates of one point.
with open('聚类数据集/dataset.txt') as data:
    for line in data:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line carries no point; float('') would raise.
            continue
        fields = stripped.split('\t')
        x.append(float(fields[0]))
        y.append(float(fields[1]))
# Initialise the visited flags, one per loaded point.
visited.extend([False] * len(x))
def judge():
    """Report whether some unvisited point is still a core point.

    Scans every point index in order, ignores the ones already flagged in
    ``visited``, and returns True as soon as one of the rest has at least
    ``minPts`` neighbours according to ``countObject``; otherwise False.
    """
    return any(
        not visited[idx] and countObject(x, y, idx)[0] >= minPts
        for idx in range(len(x))
    )
def select():
    """Return the index of the first point not yet visited, or -1 if none is left."""
    unvisited = (idx for idx, seen in enumerate(visited) if not seen)
    return next(unvisited, -1)
def countObject(x, y, p, eps=None):
    """Find the points lying inside the neighbourhood of point ``p``.

    Args:
        x: x coordinates of all points.
        y: y coordinates of all points, aligned with ``x``.
        p: index of the point whose neighbourhood is examined.
        eps: neighbourhood radius; when omitted, the module-level
            ``epsilon`` is used.

    Returns:
        A ``(count, indices)`` tuple: how many other points lie within the
        radius of ``p`` (boundary included) and the list of their indices in
        ascending order. ``p`` itself is never counted.
    """
    if eps is None:
        eps = epsilon
    # Compare squared distances so no square root is needed.
    limit = eps ** 2
    px, py = x[p], y[p]
    lis = [
        i
        for i, (xi, yi) in enumerate(zip(x, y))
        if i != p and (xi - px) ** 2 + (yi - py) ** 2 <= limit
    ]
    return len(lis), lis
def check(c):
    """Report whether ``c`` still contains an unvisited core point.

    ``c`` is an iterable of point indices. Returns True as soon as one index
    that is not flagged in ``visited`` has at least ``minPts`` neighbours
    according to ``countObject``; otherwise False.
    """
    pending = (idx for idx in c if not visited[idx])
    return any(countObject(x, y, idx)[0] >= minPts for idx in pending)
def dbscan():
    """Cluster the loaded points with DBSCAN.

    Works on the module-level state: reads ``x``, ``y`` and ``minPts``,
    updates the ``visited`` flags, appends one list of point indices per
    cluster to ``C`` and appends the indices that belong to no cluster to
    ``noise``.

    Returns:
        The module-level list ``C`` of clusters.
    """
    # Indices already placed in some cluster. Keeps each cluster free of
    # duplicates and leaves a border point in the first cluster that claims it.
    assigned = {i for members in C for i in members}
    while judge():  # an unvisited core point still exists
        p = select()  # first point that has not been visited yet
        visited[p] = True
        cnt, lis = countObject(x, y, p)
        if cnt < minPts:
            # Not a core point: it cannot seed a cluster, but a later cluster
            # may still absorb it as a border point.
            continue
        c = [p]
        assigned.add(p)
        for i in lis:
            if i not in assigned:
                assigned.add(i)
                c.append(i)
        # c doubles as the work list: members appended below are reached by
        # this same scan, so the cluster grows until nothing new is added.
        pos = 0
        while pos < len(c):
            i = c[pos]
            pos += 1
            if visited[i]:
                continue
            visited[i] = True
            cnt1, lis1 = countObject(x, y, i)
            # Only a core point may extend the cluster, so test the neighbour
            # count of i itself (cnt1) rather than the seed's count (cnt).
            if cnt1 >= minPts:
                for j in lis1:
                    if j not in assigned:
                        assigned.add(j)
                        c.append(j)
        C.append(c)
    # Noise is every point that ended up in no cluster; the visited flag alone
    # cannot tell noise from border points.
    noise.extend(i for i in range(len(visited)) if i not in assigned)
    return C
if __name__ == '__main__':
    # Run the clustering, then draw the noise points and every cluster.
    cluster = dbscan()
    # Noise points are drawn first as magenta diamonds.
    X = [x[i] for i in noise]
    Y = [y[i] for i in noise]
    plt.scatter(X, Y, c='m', marker='D')
    plt.legend(['noise'])
    for i, members in enumerate(cluster):
        X = [x[j] for j in members]
        Y = [y[j] for j in members]
        # Cycle through the palette so that having more clusters than colours
        # reuses colours instead of raising IndexError.
        plt.scatter(X, Y, c=color[i % len(color)], alpha=1, s=50)
    plt.title('dbscan')
    plt.show()