面向对象的层次聚类算法

用豆包生成了一个面向对象的层次聚类算法代码,先收在这里,有空调试,之后再补充其他聚类算法的代码:

复制代码
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
import warnings

# ======================
# Basic configuration
# ======================
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # CJK-capable font so Chinese plot labels render
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly under a CJK font
warnings.filterwarnings("ignore")  # NOTE(review): silences ALL warnings globally — deliberate for clean demo output


# ======================
# 1. PCA 分析类
# ======================
class PCAAnalyzer:
    """Standardize a feature matrix and reduce it with PCA.

    The target dimensionality is chosen automatically as the smallest
    number of components whose cumulative explained-variance ratio
    reaches ``variance_threshold``.
    """

    def __init__(self, variance_threshold=0.95):
        # Fraction of total variance the reduced representation must retain.
        self.variance_threshold = variance_threshold
        self.scaler = StandardScaler()
        self.pca_full = None     # PCA over all components, kept for diagnostics
        self.pca_reduced = None  # PCA refit with the selected dimensionality
        self.X_scaled = None
        self.X_pca = None
        self.best_n = None       # chosen dimensionality; set by fit()

    def fit(self, X):
        """Fit scaler and PCA on X (DataFrame or ndarray of features).

        Returns self so calls can be chained.
        """
        self.X_scaled = self.scaler.fit_transform(X)

        # Full PCA: needed to inspect every component's variance ratio.
        self.pca_full = PCA().fit(self.X_scaled)

        # Smallest n whose cumulative variance reaches the threshold.
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)
        reached = cumulative_variance >= self.variance_threshold
        if reached.any():
            self.best_n = int(np.argmax(reached)) + 1
        else:
            # Threshold never reached (e.g. set above 1.0): keep every
            # component instead of silently collapsing to one dimension
            # (np.argmax on an all-False mask returns 0).
            self.best_n = len(cumulative_variance)

        # Refit with the selected dimensionality and project the data.
        self.pca_reduced = PCA(n_components=self.best_n)
        self.X_pca = self.pca_reduced.fit_transform(self.X_scaled)

        return self

    def plot_scree(self):
        """Plot per-component and cumulative variance ratios (scree plot)."""
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)

        plt.figure(figsize=(10, 6))
        plt.plot(
            range(1, len(self.pca_full.explained_variance_ratio_) + 1),
            self.pca_full.explained_variance_ratio_,
            'o-', color='#1f77b4', linewidth=2, markersize=8,
            label='单个主成分方差'
        )
        plt.plot(
            range(1, len(cumulative_variance) + 1),
            cumulative_variance,
            'ro-', linewidth=2, markersize=8,
            label='累计方差贡献率'
        )
        # Horizontal reference line marking the retention threshold.
        plt.axhline(y=self.variance_threshold, color='green',
                    linestyle='--', linewidth=2,
                    label=f'{int(self.variance_threshold*100)}% 信息阈值')

        plt.xlabel('主成分维度')
        plt.ylabel('方差贡献率')
        plt.title('PCA 碎石图(判断最佳降维维度)')
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.xticks(range(1, len(self.pca_full.explained_variance_ratio_) + 1))
        plt.show()

    def summary(self):
        """Print the chosen dimensionality and variance-retention details."""
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)
        print(f"\n✅ 建议降维到:{self.best_n} 维(保留 ≥{self.variance_threshold*100}% 的数据信息)\n")
        print("📊 各维度累计方差:")
        for i, var in enumerate(cumulative_variance):
            print(f"第{i+1}维:{var:.2%}")

        print("\n📊 实际降维后方差占比:", self.pca_reduced.explained_variance_ratio_)
        print("📊 总信息保留比例:",
              sum(self.pca_reduced.explained_variance_ratio_).round(4) * 100, "%")

    def get_data(self):
        """Return the reduced data as a DataFrame with 主成分1..主成分n columns."""
        columns = [f'主成分{i+1}' for i in range(self.best_n)]
        return pd.DataFrame(self.X_pca, columns=columns)


# ======================
# 2. KMeans 聚类类
# ======================
class ClusterAnalyzer:
    """KMeans clustering with a fixed seed, plus a quick evaluation helper."""

    def __init__(self, n_clusters=4):
        self.n_clusters = n_clusters
        # Fixed random_state keeps runs reproducible; n_init='auto' lets
        # sklearn choose the number of centroid initializations.
        self.model = KMeans(n_clusters=self.n_clusters, random_state=42, n_init='auto')
        self.labels = None

    def fit(self, X):
        """Cluster X, remember the assigned labels, and return self for chaining."""
        self.labels = self.model.fit_predict(X)
        return self

    def evaluate(self, X):
        """Print cluster count, silhouette score and per-cluster sizes; return the score."""
        sil = silhouette_score(X, self.labels)
        counts = np.bincount(self.labels)
        print(f"\n聚类类别数:{self.n_clusters}")
        print(f"轮廓系数(越接近1越好):{sil:.3f}")
        print(f"每类样本数量:{counts}")
        return sil


# ======================
# 3. 层次聚类类
# ======================
class HierarchicalAnalyzer:
    """Agglomerative (hierarchical) clustering with a dendrogram plot."""

    def __init__(self, n_clusters=2, linkage='ward'):
        self.n_clusters = n_clusters
        self.linkage = linkage  # linkage criterion: 'ward', 'single', 'average', 'complete'

    def plot_dendrogram(self, X):
        """Draw the dendrogram with a cut line that actually yields n_clusters.

        The original code drew the line at a hard-coded height (3.5) that had
        no relation to ``n_clusters``; here the height is derived from the
        linkage matrix so the dashed line matches the advertised split.
        """
        plt.figure(figsize=(12, 5))
        linkage_matrix = sch.linkage(X, method=self.linkage)
        sch.dendrogram(linkage_matrix)

        # With n samples there are n-1 merges; cutting between the (n-k)-th
        # and (n-k+1)-th merge heights yields exactly k flat clusters for
        # monotone linkages (ward/single/average/complete).
        k = self.n_clusters
        if 2 <= k <= linkage_matrix.shape[0]:
            cut = (linkage_matrix[-k, 2] + linkage_matrix[-(k - 1), 2]) / 2
            plt.axhline(y=cut, color='red', linestyle='--',
                        label=f'分成 {self.n_clusters} 类')
            plt.legend()
        plt.title('聚类树状图 (Dendrogram)')
        plt.xlabel('样本编号')
        plt.ylabel('聚类距离')
        plt.show()

    def fit(self, X):
        """Run agglomerative clustering; print and return the flat labels."""
        model = AgglomerativeClustering(
            n_clusters=self.n_clusters,
            metric='euclidean',
            linkage=self.linkage
        )
        labels = model.fit_predict(X)

        print("✅ 层次聚类分类结果:")
        print(labels)
        print("类别数量统计:", np.bincount(labels))
        return labels


# ======================
# 4. 流水线封装
# ======================
class Pipeline:
    """End-to-end flow: load CSV -> PCA -> KMeans -> hierarchical clustering -> export.

    ``feature_start_col`` generalizes the previously hard-coded ``iloc[:, 2:]``
    (skip the leading ID/label columns) and ``output_path`` generalizes the
    hard-coded export filename. Both default to the original values, so
    existing callers are unaffected.
    """

    def __init__(self, csv_path, variance_threshold=0.95,
                 kmeans_clusters=4, hier_clusters=2,
                 feature_start_col=2, output_path="PCA降维结果表.xlsx"):
        self.csv_path = csv_path
        self.feature_start_col = feature_start_col
        self.output_path = output_path
        self.pca = PCAAnalyzer(variance_threshold)
        self.kmeans = ClusterAnalyzer(kmeans_clusters)
        self.hier = HierarchicalAnalyzer(hier_clusters)

    def run(self):
        """Execute the whole analysis pipeline and export the PCA table."""
        # Load data; columns before feature_start_col are treated as non-features.
        data = pd.read_csv(self.csv_path)
        X = data.iloc[:, self.feature_start_col:]

        # PCA: fit, diagnostic plot, textual summary.
        self.pca.fit(X)
        self.pca.plot_scree()
        self.pca.summary()

        X_pca_df = self.pca.get_data()

        # KMeans on the reduced representation.
        self.kmeans.fit(X_pca_df)
        self.kmeans.evaluate(X_pca_df)

        # Hierarchical clustering: dendrogram, then flat labels.
        self.hier.plot_dendrogram(X_pca_df.values)
        self.hier.fit(X_pca_df.values)

        # Export the reduced table (.xlsx export requires openpyxl).
        X_pca_df.to_excel(self.output_path, index=False)
        print("\n✅ 所有流程完成,结果已导出")


# ======================
# 5. 一键运行
# ======================
if __name__ == "__main__":
    # Build the pipeline with explicit hyper-parameters, then run everything.
    runner = Pipeline(
        csv_path=r'C:/Users/cy/Desktop/20260513/country.csv',
        variance_threshold=0.95,
        kmeans_clusters=4,
        hier_clusters=2,
    )
    runner.run()

另外从网络上找到一个对比所有参数组合的网格搜索算法,尚未运行验证是否有问题,暂时放在这里,有空研究:

复制代码
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')  # silence warnings for cleaner output

# ===================== 1. Generate/load the data =====================
# Simulated data here; replace with your own: data = pd.read_csv("your_data.csv").values
np.random.seed(42)
data = np.random.randn(500, 50)  # 500 samples, 50 features

# ===================== 2. Parameter grids to sweep =====================
# [Edit these three lists freely]
threshold_list = [0.5, 0.6, 0.7, 0.8, 0.9]        # variance-threshold candidates
pca_components_list = [5, 10, 15, 20, 25]         # PCA component counts
cluster_nums_list = [2, 3, 4, 5, 6]               # hierarchical cluster counts

# ===================== 3. Grid-search all combinations =====================
results = []       # one row per evaluated combination
seen_pca = set()   # (threshold, actual n_pca) pairs already evaluated
print("正在遍历所有参数组合并计算轮廓系数...\n")

# Standardize (clustering on raw scales would be dominated by large features).
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

for threshold in threshold_list:
    # Step 1: variance-threshold feature filter (swap in your own rule if needed).
    variances = np.var(data_scaled, axis=0)
    selected_features = variances >= threshold
    data_filtered = data_scaled[:, selected_features]

    # Nothing survived the filter — skip this threshold.
    if data_filtered.shape[1] == 0:
        continue

    for n_comp in pca_components_list:
        # Step 2: PCA — the component count cannot exceed the feature count.
        n_pca = min(n_comp, data_filtered.shape[1])
        # BUGFIX: when n_comp is capped, several requested values collapse to
        # the same n_pca; skip duplicates and record the count actually used
        # (the original recorded the requested n_comp, which was misleading).
        if (threshold, n_pca) in seen_pca:
            continue
        seen_pca.add((threshold, n_pca))

        pca = PCA(n_components=n_pca, random_state=42)
        data_pca = pca.fit_transform(data_filtered)

        for n_clusters in cluster_nums_list:
            # Step 3: hierarchical clustering.
            cluster = AgglomerativeClustering(
                n_clusters=n_clusters,
                linkage='ward'  # common choice; alternatives: single/average/complete
            )
            labels = cluster.fit_predict(data_pca)

            # Step 4: silhouette score (higher is better); needs >= 2 clusters.
            if len(np.unique(labels)) >= 2:
                score = silhouette_score(data_pca, labels)
            else:
                score = -1  # sentinel for an invalid clustering

            results.append({
                "阈值": threshold,
                "PCA主成分数": n_pca,  # the count actually used after capping
                "聚类类别数": n_clusters,
                "轮廓系数": round(score, 4)
            })

# ===================== 4. Build the results table =====================
df_results = pd.DataFrame(results)

# Sort by silhouette score, best first.
df_sorted = df_results.sort_values(by="轮廓系数", ascending=False).reset_index(drop=True)

# ===================== 5. Pick the best combination =====================
best = df_sorted.iloc[0]  # first row holds the maximum

# ===================== 6. Report =====================
print("="*80)
print("📊 所有参数组合轮廓系数对比表(从高到低排序)")
print("="*80)
print(df_sorted.to_string(index=False))

print("\n" + "="*80)
print("✅ 最优参数组合(轮廓系数最大)")
print("="*80)
print(f"最大轮廓系数:{best['轮廓系数']}")
print(f"最优阈值:{best['阈值']}")
print(f"最优PCA主成分数:{int(best['PCA主成分数'])}")
print(f"最优层次聚类类别数:{int(best['聚类类别数'])}")
相关推荐
Ares-Wang1 小时前
AI》》人工智能》》AIGC》》deepseek常见用法 PPT、思维导图等
人工智能·python
m0_631529821 小时前
如何创建物化视图日志_CREATE MATERIALIZED VIEW LOG记录基表DML变更
jvm·数据库·python
m0_702036531 小时前
Layui表格渲染如何处理字段名为JSON关键字(如order)的情况
jvm·数据库·python
m0_591364731 小时前
mysql连接查询中包含大表如何优化_采用嵌套循环JOIN优化顺序
jvm·数据库·python
风落无尘1 小时前
《智能重生:从垃圾堆到AI工程师》——第九章 语言与理解
人工智能·python·卷积神经网络
2401_884454151 小时前
golang如何给图片添加水印_golang图片添加水印解析
jvm·数据库·python
hongjianMa1 小时前
【论文阅读】Structured Spectral Reasoning for Frequency-Adaptive Multimodal Recommendation
论文阅读·python·深度学习·推荐系统·多模态推荐
kexnjdcncnxjs1 小时前
如何用SQL统计每组的平均值同时显示原行_OVER子句
jvm·数据库·python
CLX05051 小时前
Redis如何防范脑裂导致的数据丢失_配置min-replicas-to-write强制要求可用从节点数
jvm·数据库·python