使用golang实现k-means

k-means聚类算法

K-Means是一种无监督算法,其目标是将数据进行分类。分类个数要求已知。

k-means流程

  1. 随机确定K个点作为质心、
  2. 找到离每个点最近的质心,将这个点分配到这个质心代表的簇里
  3. 再对每个簇进行计算,以点簇的均值点作为新的质心
  4. 如果新的质心和上一轮的不一样,则迭代进行2-3步骤,直到质心位置稳定

本文目标

  1. 在golang中实现k-means算法。
  2. 使用matplotlib绘制聚类散点图。
  3. 尝试并行处理。
  4. 与sklearn结果对比。

执行结果

左侧:输出

中间:本文效果

右侧:sklearn效果

Q: 为什么样本一样,结果不同?

A: 两方面,首先算法的结束方法中阈值不同,然后是初始k均值点选择不同。

确定数据结构

我把kMeans封装成了对象,此外还有Point。实现如下

go 复制代码
type (
	Point struct {
		X, Y float64
	}
	KMeans struct {
		points       []Point
		k            int
		distanceFunc distanceFunc
		avgPoints    []Point
	}
	distanceFunc func(p1, p2 Point) float64
)

生成随机样本

go 复制代码
func generateRandomPoints(n int) []tool.Point {
	points := make([]tool.Point, n)
	for i := 0; i < n; i++ {
		points[i] = tool.Point{
			X: rand.Float64() * 100,
			Y: rand.Float64() * 100,
		}
	}
	return points
}

确定距离度量函数

go 复制代码
	distance := func(p1, p2 tool.Point) float64 {
		return math.Sqrt(math.Pow(p1.X-p2.X, 2) + math.Pow(p1.Y-p2.Y, 2))
	}

初始化kmeans对象

go 复制代码
func (kMeans *KMeans) Init(k int, points []Point, distanceFunc distanceFunc) {
	kMeans.k = k
	kMeans.points = points
	kMeans.distanceFunc = distanceFunc
	kMeans.avgPoints = make([]Point, kMeans.k)
}
func (kMeans *KMeans) initializeAvgPoints() {
	copy(kMeans.avgPoints, kMeans.points[:kMeans.k])
}

确定算法大致流程

go 复制代码
func (kMeans *KMeans) Do(checkFunc func(oldCentroids, newCentroids []Point) bool) ([][]Point, int) {
	kMeans.initializeAvgPoints()
	var (
		clusters     [][]Point
		tmpAvgPoints []Point
		count        int
	)

	for {
		// 获取聚类含有哪些点
		clusters = kMeans.computeDistanceToAvgPoints()
		// 更新聚类中心
		tmpAvgPoints = kMeans.updateAvgPoints(clusters)
		// 检查质心位置的变化
		if checkFunc(kMeans.avgPoints, tmpAvgPoints) {
			break
		}

		// 更新质心位置
		kMeans.avgPoints = tmpAvgPoints
		count++
	}
	return clusters, count
}

计算聚类(尝试并发)

Q: 为什么可以并发?

A: 因为计算聚类时,对每个点的运算的独立的,依赖的数据不会在计算时修改。

go 复制代码
func (kMeans *KMeans) computeDistanceToAvgPoints() [][]Point {
	type BakPoint struct {
		i int
		p Point
	}
	clusters := make([][]Point, len(kMeans.avgPoints))
	resultCh := make(chan BakPoint, len(kMeans.points))
	for _, point := range kMeans.points {
		computeMinDistanceForSignlePoint := func(point Point, avgPoints []Point, distanceFunc func(p1, p2 Point) float64, ch chan BakPoint) {
			minDistance := struct {
				d float64
				i int
			}{
				d: math.MaxFloat64,
				i: -1,
			}
			for i, avgPoint := range avgPoints {
				if d := distanceFunc(avgPoint, point); d < minDistance.d {
					minDistance.d = d
					minDistance.i = i
				}
			}
			ch <- BakPoint{p: point, i: minDistance.i}
		}
		go computeMinDistanceForSignlePoint(point, kMeans.avgPoints, kMeans.distanceFunc, resultCh)
	}

	for i := 0; i < len(kMeans.points); i++ {
		result := <-resultCh
		clusters[result.i] = append(clusters[result.i], result.p)
	}

	close(resultCh)
	return clusters
}

更新K均值点

go 复制代码
func (kMeans *KMeans) updateAvgPoints(clusters [][]Point) []Point {
	centroids := make([]Point, kMeans.k)
	for i, cluster := range clusters {
		sumX, sumY := 0.0, 0.0
		for _, point := range cluster {
			sumX += point.X
			sumY += point.Y
		}
		centroids[i].X = sumX / float64(len(cluster))
		centroids[i].Y = sumY / float64(len(cluster))
	}
	return centroids
}

使用matplotlib绘制图像

python 复制代码
import matplotlib.pyplot as plt


def readFromFile():
    clusters = []
    current_cluster = []

    with open('points.txt', 'r') as file:
        for line in file:
            if line.strip() == "----":
                if current_cluster:
                    clusters.append(current_cluster)
                    current_cluster = []
            else:
                x, y = map(float, line.split())
                current_cluster.append((x, y))

        if current_cluster:
            clusters.append(current_cluster)

    return clusters


clusters = readFromFile()

for cluster in clusters:
    X = [point[0] for point in cluster]
    Y = [point[1] for point in cluster]
    plt.scatter(X, Y, marker='.')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Clustered Scatter Plot')
plt.show()

使用sklearn计算

python 复制代码
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

points = []
with open('rawpoints.txt', 'r') as file:
    for line in file:
        x, y = map(float, line.split())
        points.append([x, y])

X = np.array(points)

n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='.')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=10, color='red', label='Centroids')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('K-Means Clustering')
plt.legend()
plt.show()

完整代码

项目结构如下

go 复制代码
KMeans-golang
│  go.mod
│  main.go
│  raw.py
│  show.py
└─tool
	kmeans.go

main.go

go 复制代码
package main

import (
	"fmt"
	"k-means/tool"
	"math"
	"math/rand"
	"os"
	"os/exec"
	"time"
)

func generateRandomPoints(n int) []tool.Point {
	points := make([]tool.Point, n)
	for i := 0; i < n; i++ {
		points[i] = tool.Point{
			X: rand.Float64() * 100,
			Y: rand.Float64() * 100,
		}
	}
	return points
}

func wrapper(name string, fun func()) {
	start := time.Now()
	fun()
	elapsed := time.Since(start)
	fmt.Printf("%s 函数执行时间:%s\n", name, elapsed)
	fmt.Printf("%s 函数执行时间(纳秒):%dns\n", name, elapsed.Nanoseconds())
}

func writeToFile(points [][]tool.Point) {
	file, err := os.Create("points.txt")
	if err != nil {
		fmt.Println("Failed to create file:", err)
		return
	}
	defer file.Close()

	for _, row := range points {
		for _, p := range row {
			_, err := fmt.Fprintf(file, "%f %f\n", p.X, p.Y)
			if err != nil {
				fmt.Println("Failed to write to file:", err)
				return
			}
		}
		_, err := fmt.Fprintf(file, "----\n")
		if err != nil {
			fmt.Println("Failed to write to file:", err)
			return
		}
	}
	fmt.Println("Data written to file successfully.")
}
func writeToFile2(points []tool.Point) {
	file, err := os.Create("rawpoints.txt")
	if err != nil {
		fmt.Println("Failed to create file:", err)
		return
	}
	defer file.Close()

	for _, p := range points {
		_, err := fmt.Fprintf(file, "%f %f\n", p.X, p.Y)
		if err != nil {
			fmt.Println("Failed to write to file:", err)
			return
		}
	}
	fmt.Println("Data written to file successfully.")
}
func main() {
	rand.Seed(time.Now().UnixNano())
	// 样本数据
	var points []tool.Point
	wrapper("生成样本", func() {
		points = generateRandomPoints(100)
	})

	k := 5
	distance := func(p1, p2 tool.Point) float64 {
		return math.Sqrt(math.Pow(p1.X-p2.X, 2) + math.Pow(p1.Y-p2.Y, 2))
	}
	kMeansObj := new(tool.KMeans)
	kMeansObj.Init(k, points, distance)

	var (
		finalClusters [][]tool.Point
		count         int
	)

	wrapper("执行算法", func() {
		finalClusters, count = kMeansObj.Do(func(oldCentroids, newCentroids []tool.Point) bool {
			epsilon := 0.000001
			for i := 0; i < len(oldCentroids); i++ {
				if distance(oldCentroids[i], newCentroids[i]) > epsilon {
					return false
				}
			}
			return true
		})
	})
	fmt.Println("count: ", count)
	wrapper("写入文件", func() {
		writeToFile2(points)
		writeToFile(finalClusters)
	})
	go func() {
		command := exec.Command("C:\\Projects\\PycharmProjects\\deelLearn\\venv\\Scripts\\python.exe", "raw.py")
		command.Run()
		command.Wait()
	}()
	command := exec.Command("C:\\Projects\\PycharmProjects\\deelLearn\\venv\\Scripts\\python.exe", "show.py")
	command.Run()
	command.Wait()
}

raw.py

python 复制代码
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

points = []
with open('rawpoints.txt', 'r') as file:
    for line in file:
        x, y = map(float, line.split())
        points.append([x, y])

X = np.array(points)

n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='.')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=10, color='red', label='Centroids')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('K-Means Clustering')
plt.legend()
plt.show()

show.py

python 复制代码
import matplotlib.pyplot as plt


def readFromFile():
    clusters = []
    current_cluster = []

    with open('points.txt', 'r') as file:
        for line in file:
            if line.strip() == "----":
                if current_cluster:
                    clusters.append(current_cluster)
                    current_cluster = []
            else:
                x, y = map(float, line.split())
                current_cluster.append((x, y))

        if current_cluster:
            clusters.append(current_cluster)

    return clusters


clusters = readFromFile()

for cluster in clusters:
    X = [point[0] for point in cluster]
    Y = [point[1] for point in cluster]
    plt.scatter(X, Y, marker='.')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Clustered Scatter Plot')
plt.show()

tool/kmeans.go

go 复制代码
package tool

import "math"

type (
	Point struct {
		X, Y float64
	}
	KMeans struct {
		points       []Point
		k            int
		distanceFunc distanceFunc
		avgPoints    []Point
	}
	distanceFunc func(p1, p2 Point) float64
)

func (kMeans *KMeans) Init(k int, points []Point, distanceFunc distanceFunc) {
	kMeans.k = k
	kMeans.points = points
	kMeans.distanceFunc = distanceFunc
	kMeans.avgPoints = make([]Point, kMeans.k)
}

func (kMeans *KMeans) Do(checkFunc func(oldCentroids, newCentroids []Point) bool) ([][]Point, int) {
	kMeans.initializeAvgPoints()
	var (
		clusters     [][]Point
		tmpAvgPoints []Point
		count        int
	)

	for {
		// 获取聚类含有哪些点
		clusters = kMeans.computeDistanceToAvgPoints()
		// 更新聚类中心
		tmpAvgPoints = kMeans.updateAvgPoints(clusters)
		// 检查质心位置的变化
		if checkFunc(kMeans.avgPoints, tmpAvgPoints) {
			break
		}

		// 更新质心位置
		kMeans.avgPoints = tmpAvgPoints
		count++
	}
	return clusters, count
}

func (kMeans *KMeans) initializeAvgPoints() {
	copy(kMeans.avgPoints, kMeans.points[:kMeans.k])
}

func (kMeans *KMeans) computeDistanceToAvgPoints() [][]Point {
	type BakPoint struct {
		i int
		p Point
	}
	clusters := make([][]Point, len(kMeans.avgPoints))
	resultCh := make(chan BakPoint, len(kMeans.points))
	for _, point := range kMeans.points {
		computeMinDistanceForSignlePoint := func(point Point, avgPoints []Point, distanceFunc func(p1, p2 Point) float64, ch chan BakPoint) {
			minDistance := struct {
				d float64
				i int
			}{
				d: math.MaxFloat64,
				i: -1,
			}
			for i, avgPoint := range avgPoints {
				if d := distanceFunc(avgPoint, point); d < minDistance.d {
					minDistance.d = d
					minDistance.i = i
				}
			}
			ch <- BakPoint{p: point, i: minDistance.i}
		}
		go computeMinDistanceForSignlePoint(point, kMeans.avgPoints, kMeans.distanceFunc, resultCh)
	}

	for i := 0; i < len(kMeans.points); i++ {
		result := <-resultCh
		clusters[result.i] = append(clusters[result.i], result.p)
	}

	close(resultCh)
	return clusters
}

func (kMeans *KMeans) updateAvgPoints(clusters [][]Point) []Point {
	centroids := make([]Point, kMeans.k)
	for i, cluster := range clusters {
		sumX, sumY := 0.0, 0.0
		for _, point := range cluster {
			sumX += point.X
			sumY += point.Y
		}
		centroids[i].X = sumX / float64(len(cluster))
		centroids[i].Y = sumY / float64(len(cluster))
	}
	return centroids
}
相关推荐
这个男人是小帅28 分钟前
【GAT】 代码详解 (1) 运行方法【pytorch】可运行版本
人工智能·pytorch·python·深度学习·分类
Qter_Sean30 分钟前
自己动手写Qt Creator插件
开发语言·qt
__基本操作__30 分钟前
边缘提取函数 [OPENCV--2]
人工智能·opencv·计算机视觉
何曾参静谧34 分钟前
「QT」文件类 之 QIODevice 输入输出设备类
开发语言·qt
Doctor老王35 分钟前
TR3:Pytorch复现Transformer
人工智能·pytorch·transformer
热爱生活的五柒35 分钟前
pytorch中数据和模型都要部署在cuda上面
人工智能·pytorch·深度学习
爱吃生蚝的于勒2 小时前
C语言内存函数
c语言·开发语言·数据结构·c++·学习·算法
HyperAI超神经3 小时前
【TVM 教程】使用 Tensorize 来利用硬件内联函数
人工智能·深度学习·自然语言处理·tvm·计算机技术·编程开发·编译框架
小白学大数据4 小时前
Python爬虫开发中的分析与方案制定
开发语言·c++·爬虫·python
扫地的小何尚4 小时前
NVIDIA RTX 系统上使用 llama.cpp 加速 LLM
人工智能·aigc·llama·gpu·nvidia·cuda·英伟达