无监督学习:聚类/降维/异常检测
1. 聚类算法
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
# Clustering examples. `X` is the feature matrix to cluster; it is not defined
# in this snippet and must be provided by the surrounding code.
# NOTE: `labels` is reassigned by each algorithm below, so after this section
# runs it holds only the hierarchical-clustering result.

# K-Means: split X into 3 clusters. random_state pins the centroid
# initialisation so repeated runs are reproducible.
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X)

# Elbow method for choosing K: fit one model per candidate K (2..10) and
# collect each model's inertia_. Plot `inertias` against K and pick the K
# where the curve bends.
inertias = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X)
    inertias.append(km.inertia_)

# DBSCAN (density-based clustering): the number of clusters is not specified
# up front. Per the scikit-learn docs, points labelled -1 are noise.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)

# Hierarchical (agglomerative) clustering into 3 clusters.
hc = AgglomerativeClustering(n_clusters=3)
labels = hc.fit_predict(X)
2. 降维算法
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# PCA: project X (feature matrix supplied by the surrounding code) onto its
# first two principal components, then report the explained-variance ratio
# of each retained component.
pca = PCA(
    n_components=2,
)
X_pca = pca.fit_transform(X)
print(f"解释方差比: {pca.explained_variance_ratio_}")

# t-SNE (intended for visualisation): 2-D embedding of X with a fixed seed
# so the layout is reproducible between runs.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
)
X_tsne = tsne.fit_transform(X)
3. 异常检测
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
# Anomaly detection on X (feature matrix supplied by the surrounding code).
# NOTE: `outliers` is assigned twice; after this section runs it holds only
# the One-Class SVM result.

# Isolation Forest with a fixed seed for reproducibility.
iso = IsolationForest(
    random_state=42,
    contamination=0.1,
)
outliers = iso.fit_predict(X)  # -1 marks an anomaly

# One-Class SVM with an RBF kernel.
ocsvm = OneClassSVM(
    nu=0.1,
    kernel="rbf",
)
outliers = ocsvm.fit_predict(X)
总结
| 任务 | 算法 | 适用场景 |
| --- | --- | --- |
| 聚类 | K-Means/DBSCAN | 客户分群/图像分割 |
| 降维 | PCA/t-SNE | 可视化/去噪 |
| 异常检测 | Isolation Forest | 欺诈检测/故障诊断 |