2024最新分别利用sklearn和Numpy实现c均值对鸢尾花数据集进行聚类(附完整代码和注释)

C均值聚类算法(硬C均值,英文通称 K-Means Clustering,二者指同一算法)是一种非常流行的聚类算法,用于将数据点分成多个簇,使得簇内的点尽可能相似,簇间的点尽可能不同。以下是K-Means算法的基本步骤:

  1. 初始化:随机选择K个点作为初始的簇中心(质心)。

  2. 分配:将每个数据点分配到最近的质心所属的簇中。

  3. 更新:计算每个簇中所有点的均值,更新质心为这个均值。

  4. 迭代:重复步骤2和3,直到满足某个终止条件(例如,达到最大迭代次数,或者质心的变化小于某个阈值)。

  5. 终止:当满足终止条件时,算法结束,最终的簇划分就是聚类结果。

sklearn方法

复制代码
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Load the iris data set (150 samples, 4 numeric features).
iris = datasets.load_iris()
X = iris.data

# Standardize features to zero mean / unit variance so no single feature
# dominates the Euclidean distances K-Means relies on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate numbers of clusters to evaluate.
k_values = [2, 3, 4, 5]

# Fit K-Means for each k and score the resulting clustering.
for k in k_values:
    # n_init is pinned explicitly: its default changed to 'auto' in
    # scikit-learn 1.4, so relying on the default is version-dependent
    # (and emits a FutureWarning on 1.2/1.3). n_init=10 is the classic default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    labels = kmeans.labels_

    # Silhouette score lies in [-1, 1]; higher means tighter,
    # better-separated clusters.
    silhouette_avg = silhouette_score(X_scaled, labels)
    print(f"For n_clusters = {k}, silhouette score is {silhouette_avg}")

    # Visualize assignments projected on the first two standardized features
    # (the data is 4-D; this is a 2-D projection, not the full geometry).
    plt.figure(figsize=(8, 6))
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap='viridis', marker='o', label='Cluster')
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='*', label='Centroids')
    plt.title(f'K-Means Clustering with n_clusters = {k}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

# Compare the two centroid-initialization strategies at a fixed k.
k = 3
init_methods = ['random', 'k-means++']
for init in init_methods:
    # n_init pinned explicitly: the default became 'auto' in scikit-learn 1.4,
    # so an explicit value keeps results comparable across library versions.
    kmeans = KMeans(n_clusters=k, init=init, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    labels = kmeans.labels_

    # Higher silhouette = denser, better-separated clusters.
    silhouette_avg = silhouette_score(X_scaled, labels)
    print(f"For n_clusters = {k}, init method = {init}, silhouette score is {silhouette_avg}")

    # Plot assignments and centroids on the first two standardized features.
    plt.figure(figsize=(8, 6))
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap='viridis', marker='o', label='Cluster')
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='*', label='Centroids')
    plt.title(f'K-Means Clustering with n_clusters = {k}, init method = {init}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

Numpy方法

复制代码
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# K-Means (Lloyd's algorithm) implemented with NumPy.
def kmeans(X, k, max_iters=100, random_state=None):
    """Cluster X into k groups with Lloyd's algorithm.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data points to cluster.
    k : int
        Number of clusters.
    max_iters : int
        Maximum number of assignment/update iterations.
    random_state : int or None
        Seed for reproducible centroid initialization (None keeps the
        original unseeded behavior).

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster index assigned to each sample.
    centroids : ndarray of shape (k, n_features)
        Final cluster centers.
    """
    rng = np.random.default_rng(random_state)
    n_samples, n_features = X.shape
    # Initialize centroids as k distinct samples drawn without replacement.
    centroids = X[rng.choice(n_samples, k, replace=False)]
    for _ in range(max_iters):
        # Euclidean distance: sqrt of the SUM of squared differences.
        # (The original took sqrt element-wise before summing, which
        # actually computes the Manhattan distance.)
        distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2))
        labels = np.argmin(distances, axis=1)
        # Recompute each centroid as the mean of its assigned points;
        # keep the old centroid when a cluster is empty (mean of an
        # empty slice would yield NaN and poison later iterations).
        new_centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
            for i in range(k)
        ])
        # Stop once centroids stop moving; allclose avoids relying on
        # exact float equality to terminate.
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids

# Load the iris data and standardize every feature.
iris = datasets.load_iris()
X = iris.data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Score the NumPy K-Means implementation for several cluster counts.
from sklearn.metrics import silhouette_score

k_values = [2, 3, 4]
for k in k_values:
    labels, centers = kmeans(X_scaled, k)
    score = silhouette_score(X_scaled, labels)
    print(f"For n_clusters = {k}, silhouette score is {score}")
# Re-run the NumPy K-Means for each candidate k and draw the partition
# projected on the first two standardized features.
for k in k_values:
    labels, centers = kmeans(X_scaled, k)
    plt.figure(figsize=(8, 6))
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1],
                c=labels, cmap='viridis', marker='o', label='Cluster')
    plt.scatter(centers[:, 0], centers[:, 1],
                c='red', s=200, alpha=0.75, marker='*', label='Centroids')
    plt.title(f'K-Means Clustering with n_clusters = {k}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()
    
# K-Means with k-means++ seeding, implemented with NumPy.
def kmeans_plusplus(X, k, max_iters=100, random_state=None):
    """Cluster X into k groups using k-means++ initialization + Lloyd.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data points to cluster.
    k : int
        Number of clusters.
    max_iters : int
        Maximum number of Lloyd iterations after seeding.
    random_state : int or None
        Seed for reproducible seeding (None keeps unseeded behavior).

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster index per sample.
    centroids : ndarray of shape (k, n_features)
        Final cluster centers.
    """
    rng = np.random.default_rng(random_state)
    n_samples, n_features = X.shape
    # --- k-means++ seeding: first centroid uniform at random, each next
    # one sampled with probability proportional to D(x)^2, the squared
    # Euclidean distance to the nearest centroid chosen so far. ---
    centroids = [X[rng.choice(n_samples)]]
    for _ in range(1, k):
        # Euclidean distance to every current centroid (sqrt AFTER the
        # sum; the original's element-wise sqrt gave Manhattan distance).
        diff = X[:, np.newaxis] - np.asarray(centroids)
        distances = np.sqrt((diff ** 2).sum(axis=2))
        probabilities = distances.min(axis=1) ** 2
        cumulative_probabilities = probabilities.cumsum()
        # Inverse-CDF sampling proportional to D(x)^2.
        r = rng.random() * cumulative_probabilities[-1]
        new_centroid_index = np.searchsorted(cumulative_probabilities, r)
        centroids.append(X[new_centroid_index])

    centroids = np.array(centroids)
    # --- standard Lloyd iterations (same loop as kmeans()) ---
    for _ in range(max_iters):
        distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2))
        labels = np.argmin(distances, axis=1)
        # Keep the old centroid when a cluster is empty to avoid NaN means.
        new_centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
            for i in range(k)
        ])
        # allclose: terminate on convergence without exact float equality.
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids

# Score and plot the k-means++-seeded variant over the same k values.
for k in k_values:
    labels, centers = kmeans_plusplus(X_scaled, k)
    score = silhouette_score(X_scaled, labels)
    print(f"For n_clusters = {k}, silhouette score (k-means++) is {score}")

    plt.figure(figsize=(8, 6))
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1],
                c=labels, cmap='viridis', marker='o', label='Cluster')
    plt.scatter(centers[:, 0], centers[:, 1],
                c='red', s=200, alpha=0.75, marker='*', label='Centroids')
    plt.title(f'K-Means++ Clustering with n_clusters = {k}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()
相关推荐
郝学胜-神的一滴11 小时前
机器学习特征选择:深入理解移除低方差特征与sklearn的VarianceThreshold
开发语言·人工智能·python·机器学习·概率论·sklearn
七夜zippoe3 天前
NumPy向量化计算实战:从入门到精通的性能优化指南
python·性能优化·架构·numpy·广播机制·ufunc
(; ̄ェ ̄)。3 天前
机器学习入门(九)为什么sklearn正规方程法矩阵不可逆却可以计算出结果
机器学习·矩阵·sklearn
小饼干超人3 天前
如何兼容不同版本的 scikit-learn(sklearn)库,统一获取“均方根误差(RMSE)”的计算函数
python·scikit-learn·sklearn
郝学胜-神的一滴3 天前
机器学习数据预处理:深入理解标准化与sklearn的StandardScaler
开发语言·人工智能·python·程序人生·机器学习·sklearn
做科研的周师兄4 天前
【MATLAB 实战】|多波段栅格数据提取部分波段均值——批量处理(NoData 修正 + 地理信息保真)_后附完整代码
前端·算法·机器学习·matlab·均值算法·分类·数据挖掘
郝学胜-神的一滴5 天前
机器学习数据预处理:归一化与sklearn的MinMaxScaler详解
人工智能·python·程序人生·机器学习·性能优化·sklearn
one day3215 天前
从numpy-pillow-opencv的基础学习
opencv·numpy·pillow
lrh1228006 天前
Numpy学习
numpy
拾贰_C7 天前
[python | numpy] numpy& matplotib冲突
开发语言·python·numpy