[机器学习]基于K-means聚类算法的鸢尾花数据及分类

基于K-means算法,对鸢尾花数据集的前两个特征进行聚类分析

  • 通过迭代优化,将150个样本划分到K个簇中。

  • 目标函数:最小化所有样本到其所属簇中心的距离平方和,即 $J=\sum_{k=1}^{K}\sum_{x_i\in C_k}\lVert x_i-\mu_k\rVert^2$,其中 $\mu_k$ 为第 $k$ 个簇 $C_k$ 的中心。

  • 算法步骤:

    1. 随机初始化K个簇中心。

    2. 将每个样本分配到最近的中心。

    3. 重新计算每个簇内所有样本的均值,作为该簇新的中心。

    4. 重复第2和3步,直到簇中心不再变化(收敛)或达到最大迭代次数。

程序代码:

python 复制代码
import math

import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets

# Plot labels in this script are Chinese: select the SimHei font for them and
# switch off the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Iris measurements and their class labels, fetched in a single load.
data, labels = datasets.load_iris(return_X_y=True)
print('数据维度',data.shape)
# Only the first two feature columns are clustered.
features = data[:, 0:2]
print('特征',features)

# Largest cluster count to try, and refinement passes per clustering run.
num_clusters, epoch = 6, 150
# Final objective value J of each run, in order of cluster count.
J_sum = []

def J_calculate(features,divide_re,center):
    """Return the clustering objective J for a given assignment.

    J is the sum, over all samples, of the squared Euclidean distance between
    the sample and the center of the cluster it is assigned to.

    Args:
        features: Sample matrix, one row per sample.
        divide_re: Sequence with one integer cluster index per sample.
        center: Center matrix, one row per cluster, with the same number of
            columns as ``features``.

    Returns:
        The objective value as a float; 0.0 when there are no samples.
    """
    # The sample count comes from the input instead of a fixed 150, so the
    # function works for any dataset size.
    if len(divide_re) == 0:
        return 0.0
    points = np.asarray(features, dtype=float)
    centers = np.asarray(center, dtype=float)
    # Fancy indexing picks, for every sample, the center it is assigned to.
    assigned_centers = centers[np.asarray(divide_re, dtype=int)]
    return float(np.sum((points - assigned_centers) ** 2))

def decision(features,divide_re,center,epoch):
    """Refine a clustering by sequential (sample-by-sample) K-means passes.

    In every pass each sample is, in turn, moved to its nearest center; as
    soon as the assignment differs from the one the centers were computed
    from, every non-empty cluster's center is reset to the mean of its
    members. A cluster that currently has no members keeps its previous
    center instead of turning into NaN, so it can still attract samples later.

    Args:
        features: Sample matrix, one row per sample.
        divide_re: Mutable sequence with the initial cluster index of every
            sample; updated in place.
        center: Center matrix, one row per cluster; updated in place.
        epoch: Number of passes over the samples.

    Returns:
        A tuple ``(features, divide_re, center, J_best)``. ``J_best`` has
        ``epoch`` entries: the objective J (sum of squared distances of the
        samples to their assigned centers) at the end of each pass. When a
        pass changes nothing, the remaining entries repeat the converged
        value and the loop stops early.
    """
    points = np.asarray(features, dtype=float)
    n_samples = points.shape[0]
    n_clusters = len(center)
    # Array mirror of divide_re so cluster membership can be masked cheaply.
    assign = np.array(divide_re, dtype=int)
    J_best = []
    # The incoming centers are not necessarily the means of the incoming
    # assignment, so the first visited sample always triggers a refresh.
    centers_stale = True
    for _ in range(epoch):
        changed = False
        for s1 in range(n_samples):
            # Only this sample's term of J depends on its label, so choosing
            # the nearest center is equivalent to minimising the whole J.
            gaps = np.asarray(center, dtype=float) - points[s1]
            best = int(np.argmin(np.sum(gaps ** 2, axis=1)))
            moved = best != assign[s1]
            assign[s1] = best
            divide_re[s1] = best
            if moved or centers_stale:
                for i in range(n_clusters):
                    members = points[assign == i]
                    # An empty cluster keeps its old center (no NaN mean).
                    if len(members):
                        center[i] = members.mean(axis=0)
                centers_stale = False
                changed = True
        J_now = float(np.sum((points - np.asarray(center, dtype=float)[assign]) ** 2))
        J_best.append(J_now)
        if not changed:
            # Fixed point reached: later passes would reproduce this state.
            J_best.extend([J_now] * (epoch - len(J_best)))
            break

    return features,divide_re,center,J_best

# Cluster the samples for every cluster count from 2 to num_clusters, plot
# each result, and collect the final objective J of each run.
for i in range(2,num_clusters+1):
    print(f'\n分{i}类:\n')
    # Draw the initial centers from the distinct sample points. Several rows
    # share the same two feature values, so sampling row indices could pick
    # two identical centers, leaving one cluster empty from the start.
    candidates = np.unique(features, axis=0)
    center = candidates[np.random.choice(candidates.shape[0], i, replace=False)]
    print("初始中心点", center)
    # Initial assignment: every sample goes to its nearest initial center.
    distances = np.linalg.norm(features[:, np.newaxis, :] - center, axis=2)
    divide_re = np.argmin(distances, axis=1).tolist()
    print("初始样本分类", divide_re)
    features,divide_re,center,J_best = decision(features,divide_re,center,epoch)
    # The last entry is the objective after the final refinement pass.
    print(f'{i}类最佳J值为:',J_best[-1])
    J_sum.append(J_best[-1])
    plt.scatter(features[:, 0], features[:, 1], c=divide_re, cmap='viridis', edgecolors='k')
    plt.scatter(center[:, 0], center[:, 1], marker='x', s=30, linewidths=3, color='red')
    plt.title(f'{i}类C均值分类法结果')
    plt.xlabel('第一特征')
    plt.ylabel('第二特征')
    plt.show()
# Final objective J of each run plotted against the cluster count.
plt.figure()
plt.plot(range(2, num_clusters + 1), J_sum, marker='o')
plt.title('J与类别数量关系曲线')
plt.xlabel('类别数量')
plt.ylabel('J_sum 值')
plt.show()

运行结果:

数据维度 (150, 4)

特征 [[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [5.  3.6]
 [5.4 3.9]
 [4.6 3.4]
 [5.  3.4]
 [4.4 2.9]
 [4.9 3.1]
 [5.4 3.7]
 [4.8 3.4]
 [4.8 3. ]
 [4.3 3. ]
 [5.8 4. ]
 [5.7 4.4]
 [5.4 3.9]
 [5.1 3.5]
 [5.7 3.8]
 [5.1 3.8]
 [5.4 3.4]
 [5.1 3.7]
 [4.6 3.6]
 [5.1 3.3]
 [4.8 3.4]
 [5.  3. ]
 [5.  3.4]
 [5.2 3.5]
 [5.2 3.4]
 [4.7 3.2]
 [4.8 3.1]
 [5.4 3.4]
 [5.2 4.1]
 [5.5 4.2]
 [4.9 3.1]
 [5.  3.2]
 [5.5 3.5]
 [4.9 3.6]
 [4.4 3. ]
 [5.1 3.4]
 [5.  3.5]
 [4.5 2.3]
 [4.4 3.2]
 [5.  3.5]
 [5.1 3.8]
 [4.8 3. ]
 [5.1 3.8]
 [4.6 3.2]
 [5.3 3.7]
 [5.  3.3]
 [7.  3.2]
 [6.4 3.2]
 [6.9 3.1]
 [5.5 2.3]
 [6.5 2.8]
 [5.7 2.8]
 [6.3 3.3]
 [4.9 2.4]
 [6.6 2.9]
 [5.2 2.7]
 [5.  2. ]
 [5.9 3. ]
 [6.  2.2]
 [6.1 2.9]
 [5.6 2.9]
 [6.7 3.1]
 [5.6 3. ]
 [5.8 2.7]
 [6.2 2.2]
 [5.6 2.5]
 [5.9 3.2]
 [6.1 2.8]
 [6.3 2.5]
 [6.1 2.8]
 [6.4 2.9]
 [6.6 3. ]
 [6.8 2.8]
 [6.7 3. ]
 [6.  2.9]
 [5.7 2.6]
 [5.5 2.4]
 [5.5 2.4]
 [5.8 2.7]
 [6.  2.7]
 [5.4 3. ]
 [6.  3.4]
 [6.7 3.1]
 [6.3 2.3]
 [5.6 3. ]
 [5.5 2.5]
 [5.5 2.6]
 [6.1 3. ]
 [5.8 2.6]
 [5.  2.3]
 [5.6 2.7]
 [5.7 3. ]
 [5.7 2.9]
 [6.2 2.9]
 [5.1 2.5]
 [5.7 2.8]
 [6.3 3.3]
 [5.8 2.7]
 [7.1 3. ]
 [6.3 2.9]
 [6.5 3. ]
 [7.6 3. ]
 [4.9 2.5]
 [7.3 2.9]
 [6.7 2.5]
 [7.2 3.6]
 [6.5 3.2]
 [6.4 2.7]
 [6.8 3. ]
 [5.7 2.5]
 [5.8 2.8]
 [6.4 3.2]
 [6.5 3. ]
 [7.7 3.8]
 [7.7 2.6]
 [6.  2.2]
 [6.9 3.2]
 [5.6 2.8]
 [7.7 2.8]
 [6.3 2.7]
 [6.7 3.3]
 [7.2 3.2]
 [6.2 2.8]
 [6.1 3. ]
 [6.4 2.8]
 [7.2 3. ]
 [7.4 2.8]
 [7.9 3.8]
 [6.4 2.8]
 [6.3 2.8]
 [6.1 2.6]
 [7.7 3. ]
 [6.3 3.4]
 [6.4 3.1]
 [6.  3. ]
 [6.9 3.1]
 [6.7 3.1]
 [6.9 3.1]
 [5.8 2.7]
 [6.8 3.2]
 [6.7 3.3]
 [6.7 3. ]
 [6.3 2.5]
 [6.5 3. ]
 [6.2 3.4]
 [5.9 3. ]]

分2类:

初始中心点 [[6.4 3.1]
 [7.2 3.6]]

初始样本分类 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2类最佳J值为: 58.20409278906674

分3类:

初始中心点 [[5.4 3.4]
 [5.4 3.4]
 [7.7 2.8]]

初始样本分类 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0]

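3类最佳J值为: 58.20409278906674

(注:本次运行抽到的两个初始中心取值相同(均为 [5.4 3.4]),初始分配中第1类没有任何样本,其中心在求均值时变为 NaN,此后再无样本被划入该类;因此结果实际只有两个簇,J 值与分2类时完全相同。)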

分4类:

初始中心点 [[6.7 3.1]
 [6.4 2.7]
 [6.5 3.2]
 [5.5 2.4]]

初始样本分类 [3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 0, 2, 0, 3, 1, 3, 2, 3, 0, 3, 3, 1, 3, 1, 3, 0, 3, 3, 1, 3, 2, 1, 1, 1, 1, 0, 0, 0, 1, 3, 3, 3, 3, 1, 3, 2, 0, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 0, 1, 2, 0, 3, 0, 1, 0, 2, 1, 0, 3, 3, 2, 2, 0, 0, 3, 0, 3, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2, 2, 1, 0, 0, 0, 3, 0, 0, 0, 1, 2, 2, 1]

4类最佳J值为: 28.23339146670904

分5类:

初始中心点 [[6.3 2.5]
 [5.1 3.5]
 [6.4 3.2]
 [7.1 3. ]
 [5.5 3.5]]

初始样本分类 [1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 4, 4, 4, 1, 4, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 3, 0, 0, 0, 2, 1, 2, 1, 0, 2, 0, 2, 4, 2, 4, 0, 0, 0, 2, 0, 0, 0, 2, 2, 3, 2, 0, 0, 0, 0, 0, 0, 4, 2, 2, 0, 4, 0, 0, 2, 0, 1, 0, 4, 4, 2, 1, 0, 2, 0, 3, 2, 2, 3, 1, 3, 0, 3, 2, 0, 3, 0, 0, 2, 2, 3, 3, 0, 3, 4, 3, 0, 2, 3, 0, 2, 0, 3, 3, 3, 0, 0, 0, 3, 2, 2, 2, 3, 2, 3, 0, 3, 2, 2, 0, 2, 2, 2]

5类最佳J值为: 21.200013093214928

分6类:

初始中心点 [[6.8 2.8]
 [5.8 2.6]
 [4.4 3. ]
 [6.2 3.4]
 [6.4 3.2]
 [6.  3. ]]

初始样本分类 [2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 2, 3, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 3, 3, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 4, 0, 1, 0, 1, 3, 2, 0, 1, 1, 5, 1, 5, 1, 4, 5, 1, 1, 1, 5, 5, 1, 5, 4, 4, 0, 0, 5, 1, 1, 1, 1, 1, 1, 3, 4, 1, 5, 1, 1, 5, 1, 1, 1, 5, 1, 5, 1, 1, 3, 1, 0, 5, 4, 0, 2, 0, 0, 4, 4, 0, 0, 1, 1, 4, 4, 0, 0, 1, 0, 1, 0, 5, 4, 0, 5, 5, 0, 0, 0, 0, 0, 5, 1, 0, 3, 4, 5, 0, 4, 0, 1, 4, 4, 0, 1, 4, 3, 5]

6类最佳J值为: 18.150987445152886

进程已结束,退出代码0