前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >K-Means聚类算法C++实现

K-Means聚类算法C++实现

作者头像
用户9831583
发布2022-06-16 14:28:53
4560
发布2022-06-16 14:28:53
举报
文章被收录于专栏:码出名企路

说明:将10组列表分为4类

测试函数

代码语言:javascript
复制
#include <iostream>
#include "KMeans.h"
using namespace std;

int main()
{
    double data[] = {
        0.0, 0.2, 0.4,
        0.3, 0.2, 0.4,
        0.4, 0.2, 0.4,
        0.5, 0.2, 0.4,
        5.0, 5.2, 8.4,
        6.0, 5.2, 7.4,
        4.0, 5.2, 4.4,
        10.3, 10.4, 10.5,
        10.1, 10.6, 10.7,
        11.3, 10.2, 10.9
    };

    const int size = 10; //样本数目
    const int dim = 3;   //三维矩阵
    const int cluster_num = 4; //分成4类

    KMeans* kmeans = new KMeans(dim,cluster_num);//初始化构造函数
    int* labels = new int[size];
    kmeans->SetInitMode(KMeans::InitUniform);
  kmeans->Cluster(data,size,labels);

  for(int i = 0; i < size; ++i)
  {
      printf("%f, %f, %f belongs to %d cluster\n", data[i*dim+0], data[i*dim+1], data[i*dim+2], labels[i]);
  }

  delete []labels;
  delete kmeans;

    return 0;
}

函数声明

代码语言:javascript
复制

#pragma once
#include <fstream>
#include <string.h>
#include <assert.h>

class KMeans
{
public:

  enum InitMode
  {
    InitRandom,
    InitManual,
    InitUniform,
  };

  KMeans(int dimNum = 1, int clusterNum = 1);
  ~KMeans();
  
  void SetInitMode(int i){ m_initMode = i; }

  void Init(double *data, int N);
  void Cluster(double *data, int N, int *Label);

private:
  int m_dimNum;
  int m_clusterNum;
  double** m_means;

  int m_initMode;
  int m_maxIterNum;  
  double m_endError;    

  double GetLabel(const double* x, int* label);
  double CalcDistance(const double* x, const double* u, int dimNum);
};

具体实现

代码语言:javascript
复制

#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include "KMeans.h"
using namespace std;


KMeans::KMeans(int dimNum, int clusterNum)
{
  m_dimNum = dimNum;
  m_clusterNum = clusterNum;

  m_means = new double*[m_clusterNum];//1*4的数组 []

  for (int i = 0; i < m_clusterNum; i++)
  {
    m_means[i] = new double[m_dimNum];
    memset(m_means[i], 0, sizeof(double) * m_dimNum);
  }

  m_initMode = InitRandom;
  m_maxIterNum = 100;
  m_endError = 0.001;
}

KMeans::~KMeans()
{
  for (int i = 0; i < m_clusterNum; i++)
  {
    delete[] m_means[i];
  }
  delete[] m_means;
}


void KMeans::Cluster(double *data, int N, int *Label)
{
  int size = 0;
  size=N;

  assert(size >= m_clusterNum);

  // 初始化模型
  Init(data,N);

  // 循环递归
  double* x = new double[m_dimNum];  // 样本数据
  int label = -1;    // 类别索引
  double iterNum = 0;
  double lastCost = 0;
  double currCost = 0;
  int unchanged = 0;
  bool loop = true;
  int* counts = new int[m_clusterNum];
  double** next_means = new double*[m_clusterNum];  // 再次分类新的模型
  //初始化新模型
  for (int i = 0; i < m_clusterNum; i++)
  {
    next_means[i] = new double[m_dimNum];
  }
    
  //迭代次数
  while (loop)
  {
    memset(counts, 0, sizeof(int) * m_clusterNum);
    for (int i = 0; i < m_clusterNum; i++)
    {
      memset(next_means[i], 0, sizeof(double) * m_dimNum);
    }

    lastCost = currCost;
    currCost = 0;

    // 进行初次分类
    for (int i = 0; i < size; i++)
    {
      for(int j = 0; j < m_dimNum; j++)
        x[j] = data[i*m_dimNum+j];

      currCost += GetLabel(x, &label);

      counts[label]++;
      for (int d = 0; d < m_dimNum; d++)
      {
        next_means[label][d] += x[d];
      }
    }
    currCost /= size;//平均距离,为再次分类做准备

    // 重新分类
    for (int i = 0; i < m_clusterNum; i++)
    {
      if (counts[i] > 0)
      {
        for (int d = 0; d < m_dimNum; d++)
        {
          next_means[i][d] /= counts[i];
        }
        memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
      }
    }

    // 迭代终止条件
    iterNum++;
    if (fabs(lastCost - currCost) < m_endError * lastCost)
    {
      unchanged++;
    }
    if (iterNum >= m_maxIterNum || unchanged >= 3)
    {
      loop = false;
    }

    
    cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
  }

  // 跳出迭代,输出结果
  for (int i = 0; i < size; i++)
  {
    for(int j = 0; j < m_dimNum; j++)
        x[j] = data[i*m_dimNum+j];
    GetLabel(x, &label);
    Label[i] = label;
  }
  delete[] counts;
  delete[] x;
  for (int i = 0; i < m_clusterNum; i++)
  {
    delete[] next_means[i];
  }
  delete[] next_means;
}

//初始化模型
void KMeans::Init(double *data, int N)
{
  int size = N;

  if (m_initMode ==  InitRandom)
  {
    int inteval = size / m_clusterNum;//每个分类里有几组
    double* sample = new double[m_dimNum];

    // Seed the random-number generator with current time
    srand((unsigned)time(NULL));

    for (int i = 0; i < m_clusterNum; i++)
    {
      int select = inteval * i + (inteval - 1) * rand() / RAND_MAX;
      for(int j = 0; j < m_dimNum; j++)
        sample[j] = data[select*m_dimNum+j];
      memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
    }

    delete[] sample;
  }
  else if (m_initMode == InitUniform)
  {
    double* sample = new double[m_dimNum];

    for (int i = 0; i < m_clusterNum; i++)
    {
      int select = i * size / m_clusterNum;
      for(int j = 0; j < m_dimNum; j++)
        sample[j] = data[select*m_dimNum+j];
      memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
    }

    delete[] sample;
  }
  else if (m_initMode == InitManual)
  {
    // Do nothing
  }
}

//返回分类标准,以距离为准
double KMeans::GetLabel(const double* sample, int* label)
{
  double dist = -1;
  for (int i = 0; i < m_clusterNum; i++)
  {
    double temp = CalcDistance(sample, m_means[i], m_dimNum);
    if (temp < dist || dist == -1)
    {
      dist = temp;
      *label = i;
    }
  }
  return dist;
}

//计算街区距离
double KMeans::CalcDistance(const double* x, const double* u, int dimNum)
{
  double temp = 0;
  for (int d = 0; d < dimNum; d++)
  {
    temp += (x[d] - u[d]) * (x[d] - u[d]);
  }
  return sqrt(temp);
}


博主:菜鸟程序员

初衷:学习资料,程序设计,视觉算法,求职经验,工作心得

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2020-02-07,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 码出名企路 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档