用豆包做了一个面向对象的聚类算法(层次聚类)的代码,先收在这里,有空调试,然后加上其他聚类算法的代码:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
import warnings
# ======================
# Basic configuration
# ======================
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # CJK-capable font so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly with a CJK font
warnings.filterwarnings("ignore")  # silence library warnings for cleaner console output
# ======================
# 1. PCA 分析类
# ======================
class PCAAnalyzer:
    """Standardize a feature matrix, run PCA, and keep just enough
    components to retain ``variance_threshold`` of the total variance.

    Typical usage::

        analyzer = PCAAnalyzer(0.95).fit(X)
        reduced = analyzer.get_data()
    """

    def __init__(self, variance_threshold=0.95):
        # Fraction of total variance the reduced representation must retain.
        self.variance_threshold = variance_threshold
        self.scaler = StandardScaler()
        self.pca_full = None     # PCA over all components (for the scree plot)
        self.pca_reduced = None  # PCA refit with the chosen dimensionality
        self.X_scaled = None     # standardized input
        self.X_pca = None        # reduced data
        self.best_n = None       # chosen number of components (set by fit)

    @staticmethod
    def _choose_n_components(cumulative_variance, threshold):
        """Return the smallest k such that cumulative_variance[k-1] >= threshold.

        Falls back to *all* components when the threshold is never reached —
        the original ``np.argmax(cum >= thr) + 1`` silently returned 1 in
        that case, because argmax of an all-False array is 0.
        """
        reached = np.flatnonzero(cumulative_variance >= threshold)
        return int(reached[0]) + 1 if reached.size else len(cumulative_variance)

    def fit(self, X):
        """Fit the scaler and PCA on X (DataFrame or ndarray, shape (n, d))."""
        self.X_scaled = self.scaler.fit_transform(X)
        # Full PCA first so the scree plot can show every component.
        self.pca_full = PCA().fit(self.X_scaled)
        # Automatically pick the dimensionality from cumulative variance.
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)
        self.best_n = self._choose_n_components(cumulative_variance,
                                                self.variance_threshold)
        # Refit with the chosen dimensionality and transform the data.
        self.pca_reduced = PCA(n_components=self.best_n)
        self.X_pca = self.pca_reduced.fit_transform(self.X_scaled)
        return self

    def plot_scree(self):
        """Plot per-component and cumulative variance ratios (scree plot)."""
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)
        n_components = len(self.pca_full.explained_variance_ratio_)
        plt.figure(figsize=(10, 6))
        plt.plot(
            range(1, n_components + 1),
            self.pca_full.explained_variance_ratio_,
            'o-', color='#1f77b4', linewidth=2, markersize=8,
            label='单个主成分方差'
        )
        plt.plot(
            range(1, len(cumulative_variance) + 1),
            cumulative_variance,
            'ro-', linewidth=2, markersize=8,
            label='累计方差贡献率'
        )
        # Horizontal line marking the variance-retention target.
        plt.axhline(y=self.variance_threshold, color='green',
                    linestyle='--', linewidth=2,
                    label=f'{int(self.variance_threshold*100)}% 信息阈值')
        plt.xlabel('主成分维度')
        plt.ylabel('方差贡献率')
        plt.title('PCA 碎石图(判断最佳降维维度)')
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.xticks(range(1, n_components + 1))
        plt.show()

    def summary(self):
        """Print the chosen dimensionality and variance-retention details."""
        cumulative_variance = np.cumsum(self.pca_full.explained_variance_ratio_)
        print(f"\n✅ 建议降维到:{self.best_n} 维(保留 ≥{self.variance_threshold*100}% 的数据信息)\n")
        print("📊 各维度累计方差:")
        for i, var in enumerate(cumulative_variance):
            print(f"第{i+1}维:{var:.2%}")
        print("\n📊 实际降维后方差占比:", self.pca_reduced.explained_variance_ratio_)
        print("📊 总信息保留比例:",
              sum(self.pca_reduced.explained_variance_ratio_).round(4) * 100, "%")

    def get_data(self):
        """Return the reduced data as a DataFrame with named components."""
        columns = [f'主成分{i+1}' for i in range(self.best_n)]
        return pd.DataFrame(self.X_pca, columns=columns)
# ======================
# 2. KMeans 聚类类
# ======================
class ClusterAnalyzer:
    """Thin wrapper around KMeans with a fixed random seed.

    Call ``fit`` before ``evaluate``; ``evaluate`` prints the silhouette
    score and the per-cluster sample counts.
    """

    def __init__(self, n_clusters=4):
        self.n_clusters = n_clusters
        # random_state pins the seed for reproducible runs;
        # n_init='auto' requires scikit-learn >= 1.2.
        self.model = KMeans(
            n_clusters=self.n_clusters,
            random_state=42,
            n_init='auto'
        )
        self.labels = None  # set by fit()

    def fit(self, X):
        """Cluster X and store the predicted labels."""
        self.labels = self.model.fit_predict(X)
        return self

    def evaluate(self, X):
        """Print and return the silhouette score of the fitted labels.

        Raises:
            RuntimeError: if called before fit() — previously this fell
                through to sklearn with a confusing error.
        """
        if self.labels is None:
            raise RuntimeError("call fit() before evaluate()")
        score = silhouette_score(X, self.labels)
        print(f"\n聚类类别数:{self.n_clusters}")
        print(f"轮廓系数(越接近1越好):{score:.3f}")
        print(f"每类样本数量:{np.bincount(self.labels)}")
        return score
# ======================
# 3. 层次聚类类
# ======================
class HierarchicalAnalyzer:
    """Agglomerative (hierarchical) clustering plus a dendrogram plot."""

    def __init__(self, n_clusters=2, linkage='ward'):
        self.n_clusters = n_clusters
        self.linkage = linkage

    @staticmethod
    def _cut_height(linkage_matrix, n_clusters):
        """Return a distance at which cutting the tree yields ``n_clusters``.

        Row i of a scipy linkage matrix records a merge at distance
        linkage_matrix[i, 2]; after merge i there are n_samples - 1 - i
        clusters left.  The midpoint between the two merge distances that
        straddle the wanted cluster count is therefore a valid cut height.
        """
        n_merges = linkage_matrix.shape[0]  # == n_samples - 1
        if n_clusters <= 1:
            # Cut above the highest merge -> a single cluster.
            return float(linkage_matrix[-1, 2]) * 1.05
        # First merge that would reduce the tree below n_clusters clusters
        # (clamped for n_clusters > n_samples).
        idx = max(n_merges - n_clusters + 1, 0)
        upper = float(linkage_matrix[idx, 2])
        lower = float(linkage_matrix[idx - 1, 2]) if idx > 0 else 0.0
        return (lower + upper) / 2.0

    def plot_dendrogram(self, X):
        """Plot the dendrogram with a cut line at the height that actually
        splits the tree into ``self.n_clusters`` clusters (the cut height
        was previously hard-coded to 3.5, contradicting the label)."""
        plt.figure(figsize=(12, 5))
        linkage_matrix = sch.linkage(X, method=self.linkage)
        sch.dendrogram(linkage_matrix)
        cut = self._cut_height(linkage_matrix, self.n_clusters)
        plt.axhline(y=cut, color='red', linestyle='--',
                    label=f'分成 {self.n_clusters} 类')
        plt.title('聚类树状图 (Dendrogram)')
        plt.xlabel('样本编号')
        plt.ylabel('聚类距离')
        plt.legend()
        plt.show()

    def fit(self, X):
        """Run agglomerative clustering and return the labels."""
        model = AgglomerativeClustering(
            n_clusters=self.n_clusters,
            metric='euclidean',
            linkage=self.linkage
        )
        labels = model.fit_predict(X)
        print("✅ 层次聚类分类结果:")
        print(labels)
        print("类别数量统计:", np.bincount(labels))
        return labels
# ======================
# 4. 流水线封装
# ======================
class Pipeline:
    """End-to-end flow: load CSV -> PCA -> KMeans -> hierarchical clustering.

    Args:
        csv_path: path of the input CSV file.
        variance_threshold: passed to PCAAnalyzer.
        kmeans_clusters: passed to ClusterAnalyzer.
        hier_clusters: passed to HierarchicalAnalyzer.
        feature_start: index of the first feature column; leading columns
            (e.g. id / name) are skipped.  Default 2 preserves the original
            hard-coded ``iloc[:, 2:]`` behaviour.
    """

    def __init__(self, csv_path, variance_threshold=0.95,
                 kmeans_clusters=4, hier_clusters=2, feature_start=2):
        self.csv_path = csv_path
        self.feature_start = feature_start
        self.pca = PCAAnalyzer(variance_threshold)
        self.kmeans = ClusterAnalyzer(kmeans_clusters)
        self.hier = HierarchicalAnalyzer(hier_clusters)

    def run(self):
        """Execute the whole pipeline and export the PCA table to Excel."""
        # Load the data and drop the leading non-feature columns.
        data = pd.read_csv(self.csv_path)
        X = data.iloc[:, self.feature_start:]
        # PCA
        self.pca.fit(X)
        self.pca.plot_scree()
        self.pca.summary()
        X_pca_df = self.pca.get_data()
        # KMeans
        self.kmeans.fit(X_pca_df)
        self.kmeans.evaluate(X_pca_df)
        # Hierarchical clustering
        self.hier.plot_dendrogram(X_pca_df.values)
        self.hier.fit(X_pca_df.values)
        # Export (pandas' to_excel needs openpyxl installed).
        X_pca_df.to_excel("PCA降维结果表.xlsx", index=False)
        print("\n✅ 所有流程完成,结果已导出")
# ======================
# 5. 一键运行
# ======================
def _main():
    """Script entry point: run the full pipeline on the demo CSV."""
    demo = Pipeline(
        csv_path=r'C:/Users/cy/Desktop/20260513/country.csv',
        variance_threshold=0.95,
        kmeans_clusters=4,
        hier_clusters=2,
    )
    demo.run()


if __name__ == "__main__":
    _main()
暂时从网络中扒出来一个对比所有选项的算法,没有运行不知道是否有问题,暂时放到这里,有空研究:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')  # suppress warnings for cleaner output

# ===================== 1. Generate / load the data =====================
# Simulated data; replace with your own, e.g.:
#   data = pd.read_csv("your_data.csv").values
np.random.seed(42)
data = np.random.randn(500, 50)  # 500 samples, 50 features

# ===================== 2. Parameter grids to sweep =====================
# Edit these three lists freely.
threshold_list = [0.5, 0.6, 0.7, 0.8, 0.9]  # variance-threshold candidates
pca_components_list = [5, 10, 15, 20, 25]   # PCA component counts
cluster_nums_list = [2, 3, 4, 5, 6]         # hierarchical cluster counts

# ===================== 3. Grid-search every combination =====================
results = []  # one record per (threshold, n_comp, n_clusters) combination
print("正在遍历所有参数组合并计算轮廓系数...\n")

# BUG FIX: feature variances must be computed on the RAW data.  After
# StandardScaler every column has variance exactly 1, so thresholding the
# scaled data selected all features for every threshold and the outer loop
# was a no-op.
variances = np.var(data, axis=0)

# Standardize once (clustering should work on standardized features).
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

for threshold in threshold_list:
    # Step 1: keep only features whose raw variance reaches the threshold.
    selected_features = variances >= threshold
    data_filtered = data_scaled[:, selected_features]
    if data_filtered.shape[1] == 0:
        continue  # nothing survived the filter at this threshold
    for n_comp in pca_components_list:
        # Step 2: PCA — the component count may not exceed the feature count.
        n_pca = min(n_comp, data_filtered.shape[1])
        pca = PCA(n_components=n_pca, random_state=42)
        data_pca = pca.fit_transform(data_filtered)
        for n_clusters in cluster_nums_list:
            # Step 3: hierarchical clustering (ward; alternatives:
            # single / average / complete).
            cluster = AgglomerativeClustering(
                n_clusters=n_clusters,
                linkage='ward'
            )
            labels = cluster.fit_predict(data_pca)
            # Step 4: silhouette score needs at least 2 distinct clusters.
            if len(np.unique(labels)) >= 2:
                score = silhouette_score(data_pca, labels)
            else:
                score = -1  # sentinel for an invalid configuration
            results.append({
                "阈值": threshold,
                "PCA主成分数": n_pca,  # record the count actually used (may be clamped)
                "聚类类别数": n_clusters,
                "轮廓系数": round(score, 4)
            })

# ===================== 4. Tabulate =====================
df_results = pd.DataFrame(results)

if df_results.empty:
    # Every threshold filtered out all features — nothing to report.
    print("没有有效的参数组合,请调整阈值范围")
else:
    # Sort by silhouette score, best first.
    df_sorted = df_results.sort_values(by="轮廓系数", ascending=False).reset_index(drop=True)

    # ===================== 5. Best combination =====================
    best = df_sorted.iloc[0]  # first row holds the maximum

    # ===================== 6. Report =====================
    print("="*80)
    print("📊 所有参数组合轮廓系数对比表(从高到低排序)")
    print("="*80)
    print(df_sorted.to_string(index=False))
    print("\n" + "="*80)
    print("✅ 最优参数组合(轮廓系数最大)")
    print("="*80)
    print(f"最大轮廓系数:{best['轮廓系数']}")
    print(f"最优阈值:{best['阈值']}")
    print(f"最优PCA主成分数:{int(best['PCA主成分数'])}")
    print(f"最优层次聚类类别数:{int(best['聚类类别数'])}")