一、理论
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法。DBSCAN通过将数据点视为空间中的一个个"对象",并计算它们之间的距离和密度来实现聚类。
在DBSCAN中,对于每个数据点,如果它周围(半径R内,包含该点自身)的点数不少于某个阈值(MinPts),则将其划分为核心点;如果它周围的点数达不到阈值,但它本身位于某个核心点的邻域中,则将其划分为边界点;否则,将其划分为噪声点。基于这些点的分类,DBSCAN通过连接核心点来形成聚类。具体来说,从任意一个核心点出发,把它邻域内的核心点以及这些核心点邻域内的核心点不断合并到同一个簇中(即密度可达),并将每个边界点分配给邻域包含它的某个核心点所属的簇。最终得到的簇就是聚类的结果。
DBSCAN相比于传统的K-means等聚类算法具有以下优势:
- 不需要预先指定聚类数量,可以自动发现数据的聚类结构。
- 能够处理任意形状的聚类,不受输入数据的分布情况限制。
- 可以有效地过滤噪声点,提高聚类的准确性。
- 对于密集区域的聚类效果更好,能够把高密度区域与周围的稀疏区域(噪声)区分开来。
DBSCAN的缺点是在处理高维数据时会受到"维数灾难"的影响(距离的区分度下降),且由于半径R和点数阈值是全局参数,当各个簇的密度差异较大时,聚类结果容易出现错误。因此,在实际应用中,需要结合具体场景和数据特点进行选择和优化。
二、代码
参考:
https://github.com/james-yoo/DBSCAN
#ifndef DBSCAN_H
#define DBSCAN_H
#include <vector>
#include <cmath>
// Values stored in Point::clusterID and status codes returned by
// DBSCAN::expandCluster. Real cluster numbers are positive: DBSCAN::run
// starts numbering at 1.

// Label of a point that has not been assigned to any cluster yet.
#define UNCLASSIFIED -1
// CORE_POINT / BORDER_POINT are not referenced by the sources shown with this
// header. NOTE(review): presumably reserved for per-point role tagging - confirm.
#define CORE_POINT 1
#define BORDER_POINT 2
// Label for a point whose neighbourhood is too small to seed a cluster.
// expandCluster still absorbs NOISE-labelled points into a cluster when they
// lie in the neighbourhood of one of that cluster's core points.
#define NOISE -2
// expandCluster return value: a cluster was created from the given point.
#define SUCCESS 0
// expandCluster return value: the given point has fewer than minPts neighbours.
#define FAILURE -3
// NOTE(review): a using-directive in a header leaks into every file that
// includes it; the declarations below rely on it for the unqualified `vector`.
using namespace std;
// A 3-D sample together with the label that clustering assigns to it.
typedef struct Point_
{
    float x;        // X position
    float y;        // Y position
    float z;        // Z position
    int clusterID;  // cluster label (or one of the negative status labels)
}Point;
// Density-based clustering (DBSCAN) over a set of 3-D points.
//
// The object keeps its own copy of the input in m_points; run() writes the
// resulting label into the clusterID field of each stored point.
class DBSCAN {
public:
    // minPts : minimum number of points (the query point itself counts) that
    //          must lie within the neighbourhood for a point to seed or
    //          extend a cluster.
    // eps    : neighbourhood threshold. It is compared directly against
    //          calculateDistance(), which returns a SQUARED distance, so pass
    //          the squared radius.
    // points : the points to cluster, taken by value.
    DBSCAN(unsigned int minPts, float eps, vector<Point> points)
        : m_pointSize(points.size()), m_minPoints(minPts), m_epsilon(eps)
    {
        // The parameter is already a private copy; take over its buffer
        // instead of copying every element a second time.
        m_points.swap(points);
    }
    ~DBSCAN(){}

    // Labels every stored point; see the definition for details.
    int run();
    // Indices (into m_points) of all stored points within the threshold of `point`.
    vector<int> calculateCluster(Point point);
    // Tries to grow cluster `clusterID` from `point`; returns SUCCESS or FAILURE.
    int expandCluster(Point point, int clusterID);
    // Squared Euclidean distance between two points.
    inline double calculateDistance(const Point& pointCore, const Point& pointTarget);

    // Number of points handed to the constructor.
    int getTotalPointSize() const {return m_pointSize;}
    // The minPts value handed to the constructor.
    int getMinimumClusterSize() const {return m_minPoints;}
    // The eps value handed to the constructor. Returned as float: an int
    // return type would truncate fractional thresholds such as 0.5625 to 0.
    float getEpsilonSize() const {return m_epsilon;}

public:
    vector<Point> m_points;      // stored points; clusterID holds the result

private:
    unsigned int m_pointSize;    // number of points at construction time
    unsigned int m_minPoints;    // minimum neighbourhood size
    float m_epsilon;             // neighbourhood threshold (squared distance)
};
#endif // DBSCAN_H
dbscan.cpp
#include "dbscan.h"
int DBSCAN::run()
{
int clusterID = 1;
vector<Point>::iterator iter;
for(iter = m_points.begin(); iter != m_points.end(); ++iter)
{
if ( iter->clusterID == UNCLASSIFIED )
{
if ( expandCluster(*iter, clusterID) != FAILURE )
{
clusterID += 1;
}
}
}
return 0;
}
// Tries to grow a new cluster, numbered `clusterID`, starting from `point`.
//
// point     : the candidate seed, passed BY VALUE (a copy of an element of
//             m_points); it is identified in m_points by its coordinates.
// clusterID : label written to every point that joins the cluster.
//
// Returns FAILURE when `point` has fewer than m_minPoints neighbours (it is
// then labelled NOISE in m_points), otherwise SUCCESS after the cluster has
// been expanded through every core point reachable from the seed.
int DBSCAN::expandCluster(Point point, int clusterID)
{
    vector<int> clusterSeeds = calculateCluster(point);

    if ( clusterSeeds.size() < m_minPoints )
    {
        // `point` is only a copy, so assigning to point.clusterID would be
        // lost. Write the label to the stored element instead. The point is
        // at distance 0 from itself, so it appears among its own neighbours
        // (for a non-negative threshold). Any still-unclassified exact
        // duplicate shares the same neighbourhood and is noise as well.
        for ( vector<int>::size_type i = 0; i < clusterSeeds.size(); ++i )
        {
            Point& candidate = m_points.at(clusterSeeds[i]);
            if ( candidate.clusterID == UNCLASSIFIED &&
                 candidate.x == point.x && candidate.y == point.y && candidate.z == point.z )
            {
                candidate.clusterID = NOISE;
            }
        }
        return FAILURE;
    }

    // Claim the whole neighbourhood for this cluster and remember where the
    // seed itself sits in the list (last coordinate match, 0 if none).
    vector<int>::size_type selfPos = 0;
    for ( vector<int>::size_type i = 0; i < clusterSeeds.size(); ++i )
    {
        Point& member = m_points.at(clusterSeeds[i]);
        member.clusterID = clusterID;
        if ( member.x == point.x && member.y == point.y && member.z == point.z )
        {
            selfPos = i;
        }
    }
    // The seed's neighbourhood has been handled already; drop it from the work list.
    clusterSeeds.erase(clusterSeeds.begin() + static_cast<vector<int>::difference_type>(selfPos));

    // Work list: clusterSeeds grows while it is being processed, so index it
    // (iterators would be invalidated by push_back) and re-read size() each pass.
    for ( vector<int>::size_type i = 0; i < clusterSeeds.size(); ++i )
    {
        const vector<int> neighbours = calculateCluster(m_points.at(clusterSeeds[i]));
        if ( neighbours.size() < m_minPoints )
        {
            continue;   // border point: belongs to the cluster but does not extend it
        }
        for ( vector<int>::size_type j = 0; j < neighbours.size(); ++j )
        {
            Point& neighbour = m_points.at(neighbours[j]);
            if ( neighbour.clusterID == UNCLASSIFIED )
            {
                // Not examined yet: may itself be a core point, so queue it.
                clusterSeeds.push_back(neighbours[j]);
                neighbour.clusterID = clusterID;
            }
            else if ( neighbour.clusterID == NOISE )
            {
                // Known non-core point: absorb it as a border point, no re-query.
                neighbour.clusterID = clusterID;
            }
        }
    }
    return SUCCESS;
}
// Collects the indices (positions in m_points, in ascending order) of every
// stored point whose calculateDistance() to `point` does not exceed m_epsilon.
// calculateDistance() yields a squared distance, so m_epsilon is treated as a
// squared radius. A point stored in m_points is within distance 0 of itself.
vector<int> DBSCAN::calculateCluster(Point point)
{
    vector<int> neighbourIds;
    const vector<Point>::size_type total = m_points.size();
    for ( vector<Point>::size_type pos = 0; pos < total; ++pos )
    {
        const bool withinReach = calculateDistance(point, m_points[pos]) <= m_epsilon;
        if ( withinReach )
        {
            neighbourIds.push_back(static_cast<int>(pos));
        }
    }
    return neighbourIds;
}
// Returns the SQUARED Euclidean distance between the two points (no square
// root is taken), so callers must compare it against a squared radius.
inline double DBSCAN::calculateDistance(const Point& pointCore, const Point& pointTarget )
{
    // Plain multiplication instead of pow(d, 2): same value, but no libm call
    // in what is the innermost loop of the neighbourhood search.
    const double dx = pointCore.x - pointTarget.x;
    const double dy = pointCore.y - pointTarget.y;
    const double dz = pointCore.z - pointTarget.z;
    return dx * dx + dy * dy + dz * dz;
}
main.cpp
#include <stdio.h>
#include <iostream>
#include "dbscan.h"
// Minimum number of points (the query point itself included) that must lie
// within EPSILON of a point for it to seed or extend a cluster.
#define MINIMUM_POINTS 4
// Neighbourhood threshold as a squared distance (0.75 squared, metre^2): it is
// compared against the squared distance returned by DBSCAN::calculateDistance.
#define EPSILON (0.75*0.75)
// Loads the point cloud from "benchmark_hepta.dat" and appends it to `points`.
//
// Expected layout: first line holds the number of points, then one
// "x,y,z,cluster" record per line. The reference cluster column is parsed but
// ignored; every loaded point is labelled UNCLASSIFIED.
//
// On failure a message is written to stderr. If the file cannot be opened or
// the header is unreadable, `points` is left untouched; if a record is
// malformed, reading stops and the records parsed so far are kept.
void readBenchmarkData(vector<Point>& points)
{
    // "r" is the valid read mode; the former "ra" is not a defined mode string.
    FILE *stream = fopen("benchmark_hepta.dat", "r");
    if ( stream == NULL )
    {
        fprintf(stderr, "readBenchmarkData: cannot open benchmark_hepta.dat\n");
        return;
    }

    unsigned int num_points = 0;
    if ( fscanf(stream, "%u\n", &num_points) != 1 )
    {
        fprintf(stderr, "readBenchmarkData: missing point count in header\n");
        fclose(stream);
        return;
    }

    for ( unsigned int i = 0; i < num_points; ++i )
    {
        Point p;
        int cluster = 0;   // reference label from the file, not used
        if ( fscanf(stream, "%f,%f,%f,%d\n", &p.x, &p.y, &p.z, &cluster) != 4 )
        {
            fprintf(stderr, "readBenchmarkData: malformed record %u, stopping\n", i);
            break;
        }
        p.clusterID = UNCLASSIFIED;
        points.push_back(p);
    }

    fclose(stream);
}
// Prints a header followed by one "x y z: cluster_id" line per point.
//
// points     : the labelled points to print.
// num_points : how many leading elements of `points` to print. Values that
//              are negative or larger than points.size() are clamped so the
//              vector is never indexed out of range.
void printResults(vector<Point>& points, int num_points)
{
    int count = num_points < 0 ? 0 : num_points;
    if ( static_cast<vector<Point>::size_type>(count) > points.size() )
    {
        count = static_cast<int>(points.size());
    }

    // %d matches the int argument (the former %u did not).
    printf("Number of points: %d\n"
    " x y z cluster_id\n"
    "-----------------------------\n"
    , count);

    for ( int i = 0; i < count; ++i )
    {
        const Point& p = points[i];
        printf("%5.2lf %5.2lf %5.2lf: %d\n",
               p.x,
               p.y, p.z,
               p.clusterID);
    }
}
int main()
{
vector<Point> points;
// read point data
readBenchmarkData(points);
// constructor
DBSCAN ds(MINIMUM_POINTS, EPSILON, points);
// main loop
ds.run();
// result of DBSCAN algorithm
printResults(ds.m_points, ds.getTotalPointSize());
return 0;
}