使用R语言实现kmeans算法底层代码,并对HighDensity_Scatter_Data数据进行聚类分析(不直接使用kmeans函数)
R
# 导包
library(ggplot2)
library(RColorBrewer)
# Load the data set: a comma-separated file whose first row holds the
# column names.
data_path <- "D:/RWorkPlace/K-means聚类分析/K-means聚类分析/HighDensity_Scatter_Data.csv"
mydata <- read.table(file = data_path, sep = ",", header = TRUE)
# Scatter plot of the raw data: black points drawn with alpha = 0.1,
# axis labels set explicitly, and the legend suppressed.
raw_plot_theme <- theme(
  legend.position = "none",
  plot.title = element_text(
    size = 15,
    family = "myfont",
    face = "bold.italic",
    hjust = 0.5,
    color = "black"
  ),
  text = element_text(size = 15, color = "black")
)
ggplot(data = mydata, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.1, colour = "black") +
  labs(y = "Axis Y", x = "Axis X") +
  raw_plot_theme
# Number of clusters (centroids) the data set is partitioned into.
k <- 2
# Step 0: choose the initial centroids.
# m is the number of samples; it is read from the data so the script keeps
# working when the file does not contain exactly 10000 rows.
m <- nrow(mydata)
# n is the number of feature dimensions; the features are the first n
# columns of mydata.
n <- 2
# There must be at least one centroid and no more centroids than samples.
stopifnot(k >= 1, k <= m)
# Centroid storage: one row per centroid, one column per dimension.
centers <- matrix(0, nrow = k, ncol = n)
# Draw k *distinct* row indices so that two centroids can never start from
# the same sample (which would leave one cluster permanently empty).
init_index <- sample.int(m, k)
# Copy the coordinates of the sampled rows into the centroid matrix.
centers[] <- as.matrix(mydata[init_index, seq_len(n), drop = FALSE])
# K-means core loop: assign every sample to its nearest centroid, move each
# centroid to the mean of its members, and stop once the centroids no longer
# change (or after max_iter passes).

# Numeric matrix holding only the feature columns (as many columns as the
# centroid matrix has), so arithmetic below is plain matrix arithmetic.
features <- as.matrix(mydata[, seq_len(ncol(centers)), drop = FALSE])
max_iter <- 600
for (i in seq_len(max_iter)) {
  # Keep a copy of the current centroids for the convergence check.
  t_center <- centers
  # Step 1: Euclidean distance from every sample to every centroid.
  dist_list <- matrix(0, nrow = nrow(features), ncol = k)
  for (j in seq_len(k)) {
    # sweep() subtracts the centroid coordinate-by-coordinate from each row.
    # A bare `data - centroid` would recycle the centroid down the columns
    # and pair samples with the wrong centroid component.
    centered <- sweep(features, 2, centers[j, ])
    dist_list[, j] <- sqrt(rowSums(centered^2))
  }
  # Step 2: label each sample with the index of its nearest centroid.
  argmin_dist <- apply(dist_list, 1, which.min)
  # Step 3: move each centroid to the mean of the samples assigned to it.
  for (j in seq_len(k)) {
    in_cluster <- argmin_dist == j
    # An empty cluster keeps its previous centroid.
    if (any(in_cluster)) {
      centers[j, ] <- colMeans(features[in_cluster, , drop = FALSE])
    }
  }
  # Step 4: repeat steps 1-3 until the centroids stop changing; report the
  # iteration number at which that happened.
  if (identical(t_center, centers)) {
    print(i)
    break
  }
}
# Report the final centroids. Only the first k rows of `centers` are ever
# written by the algorithm, so only those rows are printed.
cat("质心坐标矩阵:\n")
print(centers[seq_len(k), , drop = FALSE])
# Report the cluster label assigned to every sample.
cat("\n各样本点标签值:\n")
print(argmin_dist)
# Store each sample's cluster label as a factor column so it can be mapped
# to a discrete color scale.
mydata$cluster <- as.factor(argmin_dist)
# Scatter plot of the clustering result: every point is colored by the
# cluster it was assigned to.
ggplot(data = mydata, aes(x, y, color = cluster)) +
  geom_point(alpha = 0.2) +
  # One manual color per cluster level (two colors are supplied here).
  scale_color_manual(values = c("#00AFBB", "#FC4E07")) +
  labs(x = "Axis X", y = "Axis Y") +
  theme(
    text = element_text(size = 15, color = "black"),
    plot.title = element_text(
      size = 15,
      family = "myfont",
      face = "bold.italic",
      color = "black"
    ),
    legend.background = element_blank(),
    # NOTE(review): a numeric legend.position is presumably deprecated in
    # recent ggplot2 releases in favour of legend.position.inside; confirm
    # against the installed version before changing it.
    legend.position = c(0.85, 0.15)
  )
运行结果
