DBSCAN聚类算法

一、理论

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法。DBSCAN通过将数据点视为空间中的一个个"对象"，并计算它们之间的距离和密度来实现聚类。

在DBSCAN中，对于每个数据点，如果它的邻域（半径R内，包含该点自身）中的点数不少于某个阈值（MinPts），则将其划分为核心点；如果它邻域内的点数不足以满足阈值，但它本身位于某个核心点的邻域中，则将其划分为边界点；否则，将其划分为噪声点。基于这些点的分类，DBSCAN通过连接核心点来形成聚类。具体来说，从任意一个尚未归类的核心点出发，将所有与其密度可达的核心点（即通过一连串彼此位于对方邻域内的核心点相连）归入同一个簇，并将每个边界点分配给其邻域内某个核心点所属的簇（若它同时位于多个簇的核心点邻域内，则归入最先扩展到它的那个簇）。最终得到的簇就是聚类的结果。

DBSCAN相比于传统的K-means等聚类算法具有以下优势：

不需要预先指定聚类数量，可以自动发现数据的聚类结构。
能够处理任意形状的聚类，不受输入数据的分布情况限制。
可以有效地过滤噪声点，提高聚类的准确性。
对于密集区域的聚类效果更好，能够识别不同密度的子簇。

DBSCAN的缺点是在处理高维数据时会受到"维数灾难"的影响（高维空间中点与点之间的距离趋于接近，基于距离的密度估计会失去区分度）；并且由于整个数据集共用同一组参数（邻域半径和最小点数），当各个簇的密度差异较大时，簇与簇之间的边界容易被错误划分。因此，在实际应用中，需要结合具体场景和数据特点进行选择和优化。

二、代码

参考：

https://github.com/james-yoo/DBSCAN

dbscan.h

#ifndef DBSCAN_H

#define DBSCAN_H

#include <vector>

#include <cmath>

// Value of Point::clusterID for a point that has not been assigned yet.
#define UNCLASSIFIED -1

// Not referenced by the code in this file.
#define CORE_POINT 1

// Not referenced by the code in this file.
#define BORDER_POINT 2

// Value of Point::clusterID meant for points that belong to no cluster.
#define NOISE -2

// Returned by DBSCAN::expandCluster() when a cluster was formed.
#define SUCCESS 0

// Returned by DBSCAN::expandCluster() when the point has too few neighbours.
#define FAILURE -3

// NOTE(review): a using-directive in a header is injected into every file
// that includes it; the rest of this file relies on it for unqualified `vector`.
using namespace std;

/// One 3-D sample plus the cluster label attached to it.
typedef struct Point_
{
    float x;        // X position
    float y;        // Y position
    float z;        // Z position
    int clusterID;  // cluster label (UNCLASSIFIED, NOISE, or a cluster number)
} Point;

class DBSCAN {

public:

DBSCAN(unsigned int minPts, float eps, vector<Point> points){

m_minPoints = minPts;

m_epsilon = eps;

m_points = points;

m_pointSize = points.size();

}

~DBSCAN(){}

int run();

vector<int> calculateCluster(Point point);

int expandCluster(Point point, int clusterID);

inline double calculateDistance(const Point& pointCore, const Point& pointTarget);

int getTotalPointSize() {return m_pointSize;}

int getMinimumClusterSize() {return m_minPoints;}

int getEpsilonSize() {return m_epsilon;}

public:

vector<Point> m_points;

private:

unsigned int m_pointSize;

unsigned int m_minPoints;

float m_epsilon;

};

#endif // DBSCAN_H

dbscan.cpp

#include "dbscan.h"

int DBSCAN::run()

{

int clusterID = 1;

vector<Point>::iterator iter;

for(iter = m_points.begin(); iter != m_points.end(); ++iter)

{

if ( iter->clusterID == UNCLASSIFIED )

{

if ( expandCluster(*iter, clusterID) != FAILURE )

{

clusterID += 1;

}

return 0;

}

/// Tries to grow the cluster `clusterID` starting from `point`.
///
/// @param point      starting point (passed by value, i.e. a copy).
/// @param clusterID  label written to every point that joins the cluster.
/// @return FAILURE when `point` has fewer than m_minPoints neighbours
///         (nothing in m_points is modified in that case), otherwise SUCCESS.
int DBSCAN::expandCluster(Point point, int clusterID)
{
    vector<int> clusterSeeds = calculateCluster(point);

    if (clusterSeeds.size() < m_minPoints)
    {
        // `point` is a by-value copy, so a label assigned to it here would be
        // lost on return; the stored element in m_points is left untouched.
        return FAILURE;
    }

    // Claim every neighbour for this cluster and remember where the starting
    // point itself sits in the seed list (matched by exact coordinates; with
    // duplicates the last match wins).
    int indexCorePoint = 0;
    for (vector<int>::size_type k = 0; k < clusterSeeds.size(); ++k)
    {
        Point& seed = m_points.at(clusterSeeds[k]);
        seed.clusterID = clusterID;

        if (seed.x == point.x && seed.y == point.y && seed.z == point.z)
        {
            indexCorePoint = static_cast<int>(k);
        }
    }

    // The starting point has already been examined; drop it from the work list.
    clusterSeeds.erase(clusterSeeds.begin() + indexCorePoint);

    // The seed list grows while it is being processed, so the bound is
    // re-read on every iteration.
    for (vector<int>::size_type i = 0; i < clusterSeeds.size(); ++i)
    {
        const vector<int> clusterNeighbors = calculateCluster(m_points.at(clusterSeeds[i]));

        // Only seeds with enough neighbours extend the cluster further.
        if (clusterNeighbors.size() < m_minPoints)
        {
            continue;
        }

        for (vector<int>::size_type j = 0; j < clusterNeighbors.size(); ++j)
        {
            const int neighborIndex = clusterNeighbors[j];
            Point& neighbor = m_points.at(neighborIndex);

            if (neighbor.clusterID == UNCLASSIFIED)
            {
                // Not examined yet: queue it so its own neighbourhood is checked.
                clusterSeeds.push_back(neighborIndex);
                neighbor.clusterID = clusterID;
            }
            else if (neighbor.clusterID == NOISE)
            {
                // Previously labelled noise: it joins the cluster but is not
                // queued for further expansion.
                neighbor.clusterID = clusterID;
            }
        }
    }

    return SUCCESS;
}

/// Collects the indices (positions in m_points) of every stored point whose
/// calculateDistance() value to `point` does not exceed m_epsilon.
/// A point that is itself stored in m_points is included in its own result.
vector<int> DBSCAN::calculateCluster(Point point)
{
    vector<int> neighbours;

    const vector<Point>::size_type total = m_points.size();
    for (vector<Point>::size_type pos = 0; pos < total; ++pos)
    {
        if (calculateDistance(point, m_points[pos]) <= m_epsilon)
        {
            neighbours.push_back(static_cast<int>(pos));
        }
    }

    return neighbours;
}

/// Returns the SQUARED Euclidean distance between two points (no square root
/// is taken), so thresholds compared against it must be squared as well.
inline double DBSCAN::calculateDistance(const Point& pointCore, const Point& pointTarget )
{
    // Plain multiplication replaces pow(d, 2): same value, without a call to
    // the general-purpose power routine on the hottest path of the algorithm.
    const double dx = pointCore.x - pointTarget.x;
    const double dy = pointCore.y - pointTarget.y;
    const double dz = pointCore.z - pointTarget.z;

    return dx * dx + dy * dy + dz * dz;
}

main.cpp

#include <stdio.h>

#include <iostream>

#include "dbscan.h"

#define MINIMUM_POINTS 4 // minPts passed to DBSCAN: minimum number of points in a neighbourhood

#define EPSILON (0.75*0.75) // eps passed to DBSCAN: squared neighbourhood distance, metre^2

/// Reads the point cloud from "benchmark_hepta.dat" and appends it to `points`.
///
/// Expected layout: a first line holding the number of records, followed by
/// one "x,y,z,label" record per line. Every appended point is marked
/// UNCLASSIFIED; the fourth column is parsed but not used.
///
/// On failure to open the file or to parse its header, a message is written
/// to stderr and `points` is left unchanged. A malformed record stops the
/// import; points read before it are kept.
void readBenchmarkData(vector<Point>& points)
{
    // "r" is the standard read mode ("ra" is not a valid fopen mode string).
    FILE *stream = fopen("benchmark_hepta.dat", "r");
    if (stream == NULL)
    {
        perror("benchmark_hepta.dat");
        return;
    }

    unsigned int num_points = 0;
    if (fscanf(stream, "%u\n", &num_points) != 1)
    {
        fprintf(stderr, "benchmark_hepta.dat: cannot read the point count\n");
        fclose(stream);
        return;
    }

    for (unsigned int i = 0; i < num_points; ++i)
    {
        Point p;
        int cluster = 0;  // fourth column of the record; read and discarded

        if (fscanf(stream, "%f,%f,%f,%d\n", &p.x, &p.y, &p.z, &cluster) != 4)
        {
            fprintf(stderr, "benchmark_hepta.dat: malformed record %u, stopping\n", i);
            break;
        }

        p.clusterID = UNCLASSIFIED;
        points.push_back(p);
    }

    fclose(stream);
}

/// Prints a header followed by one "x y z: cluster_id" line per point.
///
/// @param points      points to print.
/// @param num_points  number of points to print; at most points.size() lines
///                    are written even if this value is larger.
void printResults(vector<Point>& points, int num_points)
{
    // %d matches the int argument (the original used %u).
    printf("Number of points: %d\n"
           " x y z cluster_id\n"
           "-----------------------------\n",
           num_points);

    // Stop at the end of the vector so a too-large count cannot read past it.
    for (int i = 0;
         i < num_points && static_cast<vector<Point>::size_type>(i) < points.size();
         ++i)
    {
        printf("%5.2lf %5.2lf %5.2lf: %d\n",
               points[i].x,
               points[i].y,
               points[i].z,
               points[i].clusterID);
    }
}

/// Loads the benchmark points, clusters them with DBSCAN and prints the
/// label assigned to each point.
int main()
{
    // Load the samples from the benchmark file.
    vector<Point> points;
    readBenchmarkData(points);

    // The clusterer works on its own copy of the samples.
    DBSCAN ds(MINIMUM_POINTS, EPSILON, points);
    ds.run();

    // Report the labelled copy held by the clusterer.
    const int total = ds.getTotalPointSize();
    printResults(ds.m_points, total);

    return 0;
}