无监督学习之均值漂移聚类模型

文章来源：企鹅号 - 用Python

# 建立均值漂移聚类模型: 用于聚群数据点（优点: 无需提前指定聚群的个数）

# 基本原理：算法把数据点的分布看成概率密度函数，根据该函数在特征空间中的分布特征找出数据点的“模式”（mode），即一群群局部最密集分布的点。

import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift,estimate_bandwidth

#---------------数据准备

import tushare as ts

# Download daily bars for stock 601518 from tushare.
# The start date originally read '2017-010-26', which is not a valid
# YYYY-MM-DD date; the month is corrected to '10'.
data = ts.get_hist_data('601518', start='2017-10-26', end='2018-06-22', ktype='D')

# Two lag features plus a label need at least three rows; fail early with a
# clear message instead of an obscure error later in the clustering step.
if data is None or len(data) < 3:
    raise ValueError('need at least 3 rows of history to build two lag features')

# Column 6 is used as the daily change throughout; presumably this is
# tushare's `p_change` (percent change) column -- TODO confirm against the API.
# NOTE(review): rows are consumed in the order tushare returns them. If that
# order is newest-first, the "lag" columns actually hold later days; confirm
# and, if needed, call data.sort_index() before this point.
change = data.iloc[:, 6].to_numpy(dtype=float)

# Row k holds the change at positions k and k+1, plus a flag that is 1 when
# the change at position k+2 is strictly positive and 0 otherwise.
datay = pd.DataFrame({
    '滞后两天': change[:-2],
    '滞后一天': change[1:-1],
    '当天涨跌情况': (change[2:] > 0).astype(int),
})

# Feature matrix for clustering: only the two lag columns, as floats.
x = datay.iloc[:, [0, 1]].to_numpy(dtype=float)

# Estimate the kernel bandwidth from the data itself, using every sample and
# the 0.1 quantile of pairwise distances.
bandwidth = estimate_bandwidth(x, quantile=0.1, n_samples=len(x))

# Build the mean-shift estimator; bin_seeding=True seeds the search from a
# binned version of the points rather than from every point.
m_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)

# Fit the model on the two lag features.
m_estimator.fit(x)

# Cluster label assigned to each sample.
labels = m_estimator.labels_

# Coordinates of each cluster centre.
cent = m_estimator.cluster_centers_

# Number of clusters = number of distinct labels.
num_cluster = len(np.unique(labels))

# Fixed: the original printed the undefined name `num_clumster`, which raised
# a NameError before any result was shown.
print('聚群的个数为:', str(num_cluster))

#------ 将聚群可视化

import matplotlib.pyplot as plt

from itertools import cycle

plt.figure()

# One marker style per plotted cluster.
markers = '.*xv'

# zip() stops at the shortest input, so the pairs are (0, '.'), (1, '*'),
# (2, 'x'), (3, 'v') and at most the first four clusters are drawn, even when
# the model found more.
for i, marker, color in zip(range(num_cluster), markers, 'rgbk'):
    # Scatter the samples whose label is i.
    plt.scatter(x[labels == i, 0], x[labels == i, 1], marker=marker, color=color, s=30)

    # Mark the i-th cluster centre with a filled black circle.
    centr = cent[i]
    plt.plot(centr[0], centr[1], marker='o', markerfacecolor='k', markeredgecolor='k', markersize=8)

plt.title('Clusters and their centroids')

plt.show()

（本来有23类，本图只画出了前4类的图）

相关快讯