from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
# Feature matrix to cluster. `your_embedding_or_pca_array` is a placeholder:
# bind it to the real data before running this script.
X = your_embedding_or_pca_array  # shape = (N, D)

# Sweep k = 2..14, score each clustering with the silhouette coefficient, and
# remember the k that scores highest.
best_k = None
# Start below every possible silhouette value (scikit-learn documents the range
# as [-1, 1]) so the first evaluated k always becomes the initial best.
best_score = float("-inf")
scores = []  # one silhouette score per k, in sweep order (reused for plotting)
for k in range(2, 15):
    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases, and an explicit value keeps the sweep reproducible
    # across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    scores.append(score)
    # Strict '>' keeps the smallest k when several k values tie.
    if score > best_score:
        best_score = score
        best_k = k
print(f"最佳 k = {best_k}, 得分 = {best_score:.4f}")
# Visualize how the silhouette score changes with k.
import matplotlib.pyplot as plt

# The x-axis must list the same k values, in the same order, as the sweep that
# filled `scores`; keep this range in sync with that loop.
plt.plot(range(2, 15), scores, marker='o')
plt.title("Silhouette Score vs K")
plt.xlabel("K")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.show()