K-Means
1.1 Data Source (Randomly Generated)
```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

# plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolors='black', s=50)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid()
plt.tight_layout()
plt.savefig('./fig/data presentation.png')
plt.show()
```
Usage of make_blobs
The make_blobs function generates a dataset for clustering: it returns an array of samples together with the corresponding labels.
- n_samples: number of sample points; default 100.
- n_features: number of features (attributes) per sample, i.e. the dimensionality of the data; default 2.
- centers: number of classes (kinds of labels); default 3.
- cluster_std: the standard deviation of each cluster, given as a float or a sequence of floats; default 1.0. For example, to generate two clusters where one is more spread out than the other, set cluster_std to [1.0, 3.0] (see the sketch after this list).
- center_box: the bounding box within which the cluster centers are generated; default (-10.0, 10.0).
- shuffle: whether to shuffle the samples; default True.
- random_state: the seed of the random number generator. With a fixed value, the same dataset is generated on every run; without one, results may differ from run to run due to randomness. Setting a value is recommended when using generated data to practice machine learning algorithms or Python.
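For instance, here is a small sketch of the sequence form of cluster_std (the variable names are illustrative):

```python
from sklearn.datasets import make_blobs

# two clusters: the second is three times more spread out than the first
X2, y2 = make_blobs(n_samples=100,
                    n_features=2,
                    centers=2,
                    cluster_std=[1.0, 3.0],
                    random_state=0)
```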
1.2 Hand-Written K-Means
The k-means algorithm has four steps:
- Randomly select k samples from the dataset to serve as the initial cluster centroids;
- Assign each sample to its nearest centroid;
- Update each centroid to the mean of the samples assigned to it (assuming the sample features are continuous);
- Repeat steps 2 and 3 until the cluster assignment of every sample no longer changes, or until a user-defined tolerance or maximum number of iterations is reached.
```python
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs


def plot_data(X, y, centroids, t):
    """Plot the samples colored by cluster and the centroids at iteration t."""
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='red', s=150)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid()
    plt.title('t={}'.format(t))
    plt.tight_layout()
    plt.savefig('./fig/t={}.png'.format(t))
    plt.show()


def distEclud(arrA, arrB):
    """Euclidean distance from one sample (arrA) to each centroid row in arrB."""
    d = arrA - arrB
    dist = np.sum(np.power(d, 2), axis=1)
    return np.sqrt(dist)


def randCent(dataSet, k):
    """Draw k random points uniformly within the bounding box of the data
    (note: these are random points, not actual samples)."""
    n = dataSet.shape[1]
    data_min = dataSet.iloc[:, :n].min()
    data_max = dataSet.iloc[:, :n].max()
    data_cent = np.random.uniform(data_min, data_max, (k, n))
    return data_cent


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m, n = dataSet.shape
    centroids = createCent(dataSet, k)
    # three bookkeeping columns: [min distance, current cluster, previous cluster]
    clusterAssment = np.zeros((m, 3))
    clusterAssment[:, 0] = np.inf
    clusterAssment[:, 1:3] = -1
    result_set = pd.concat([dataSet, pd.DataFrame(clusterAssment)], axis=1, ignore_index=True)
    clusterChanged = True
    time = 0
    plot_data(dataSet.iloc[:, 0:n].values, result_set.iloc[:, -1].values, centroids, time)
    while clusterChanged:
        for i in range(m):
            dist = distMeas(dataSet.iloc[i, :n].values, centroids)
            result_set.iloc[i, n] = dist.min()
            result_set.iloc[i, n + 1] = np.argmin(dist)  # index of the nearest centroid
        # converged when no sample changed its cluster since the last iteration
        clusterChanged = not (result_set.iloc[:, -1] == result_set.iloc[:, -2]).all()
        if clusterChanged:
            # new centroid = mean of the samples currently assigned to each cluster
            # (a cluster that receives no samples would silently disappear here)
            cent_df = result_set.groupby(n + 1).mean()
            centroids = cent_df.iloc[:, :n].values
            result_set.iloc[:, -1] = result_set.iloc[:, -2]
            time = time + 1
            plot_data(result_set.iloc[:, 0:n].values, result_set.iloc[:, -1].values, centroids, time)
            if time == 1000:
                break
    return centroids, result_set


X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)
dataSet = pd.DataFrame(X)
centroids, result_set = kMeans(dataSet, 3, distMeas=distEclud, createCent=randCent)
```
Code notes
When running k-means, the centroids must be updated repeatedly, so two iterable containers are needed to do the job:
The first container holds and updates the centroids. An array is a good fit: an array is iterable, and the index of each element also serves to label and distinguish the centroids, i.e. it is the cluster number.
The second container records, stores, and updates the distance from each point to the centroids, and makes those distances easy to compare. A three-column array works: the first column holds the shortest distance from a point to the centroids after the latest iteration; the second column holds the numeric index of the nearest centroid after that iteration, i.e. the cluster the point belongs to; and the third column holds the cluster from the previous iteration. The last two columns are compared to determine whether any assignment changed, which decides when the iteration ends.
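To make this container design concrete, here is a compact vectorized sketch of the same loop (an illustrative alternative to the pandas implementation above, not a replacement for it; it assumes no cluster ever ends up empty):

```python
import numpy as np

def kmeans_minimal(X, k, max_iter=1000, seed=0):
    rng = np.random.default_rng(seed)
    # container 1: the centroid array; the row index doubles as the cluster number
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    # container 2 (condensed): the current cluster assignment of every sample
    labels = np.full(len(X), -1)
    for _ in range(max_iter):
        # (m, k) matrix of distances from every sample to every centroid
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        new_labels = dists.argmin(axis=1)
        if np.array_equal(new_labels, labels):
            break  # no assignment changed -> converged
        labels = new_labels
        centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
    return centroids, labels
```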
1.3 Computing the SSE
k-means is an iterative method for minimizing the within-cluster sum of squared errors (Sum of Squared Errors, SSE). The SSE is sometimes also called the cluster inertia and is defined as follows:

$$\mathrm{SSE} = \sum_{i=1}^{n} \sum_{j=1}^{k} w^{(i,j)} \left\| x^{(i)} - \mu^{(j)} \right\|_2^2$$

where $\mu^{(j)}$ is the centroid of cluster $j$; $w^{(i,j)} = 1$ if sample $x^{(i)}$ belongs to cluster $j$, and $w^{(i,j)} = 0$ otherwise.
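As a minimal sketch of this definition (assuming X is the sample array, labels holds each sample's cluster index, and centroids stacks the centroid coordinates; all three names are illustrative):

```python
import numpy as np

def sse(X, labels, centroids):
    """Sum of squared distances from each sample to its assigned centroid."""
    # centroids[labels] lines each sample up with its own centroid
    return np.sum(np.linalg.norm(X - centroids[labels], axis=1) ** 2)
```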
```python
def kcLearningCurve(dataSet, cluster=kMeans, k=10):
    """
    Purpose: plot the clustering learning curve (SSE vs. number of clusters).
    Parameters:
        dataSet: the original dataset
        cluster: the k-means clustering function
        k: the largest number of clusters to try
    Returns: the list of SSE values
    """
    n = dataSet.shape[1]
    SSE = []
    for i in range(1, k):
        centroids, result_set = cluster(dataSet, i + 1)
        # column n stores each sample's distance to its centroid;
        # square it so the sum matches the SSE definition above
        SSE.append((result_set.iloc[:, n] ** 2).sum())
    plt.plot(range(2, k + 1), SSE, "--o")
    return SSE
```
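A hypothetical call (it assumes dataSet and the hand-written kMeans from section 1.2 are already defined):

```python
# sweep k from 2 to 10 and plot the resulting learning curve
SSE = kcLearningCurve(dataSet, cluster=kMeans, k=10)
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()
```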
1.4 Hand-Written K-Means++
```python
import math
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs


def plot_data(X, y, centroids, t):
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='red', s=150)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid()
    plt.title('t={}'.format(t))
    plt.tight_layout()
    plt.savefig('./fig2/t={}.png'.format(t))
    plt.show()


def distEclud(arrA, arrB):
    d = arrA - arrB
    dist = np.sum(np.power(d, 2), axis=1)
    return np.sqrt(dist)


def euler_distance(point1, point2):
    """Euclidean distance between two points, in any dimension."""
    distance = 0.0
    for a, b in zip(point1, point2):
        distance += math.pow(a - b, 2)
    return math.sqrt(distance)


def get_closest_dist(point, centroids):
    """Distance from a point to its nearest already-chosen cluster center."""
    min_dist = math.inf  # start at infinity
    for i, centroid in enumerate(centroids):
        dist = euler_distance(centroid, point)
        if dist < min_dist:
            min_dist = dist
    return min_dist


def kpp_centers(data_set, k):
    """Pick k samples from the dataset as initial centroids (k-means++ seeding)."""
    cluster_centers = []
    cluster_centers.append(random.choice(data_set))  # first center: a uniformly random sample
    d = [0 for _ in range(len(data_set))]
    for _ in range(1, k):
        total = 0.0
        for i, point in enumerate(data_set):
            d[i] = get_closest_dist(point, cluster_centers)  # distance to the closest chosen center
            total += d[i]
        total *= random.random()
        # roulette-wheel selection of the next center: points far from the
        # existing centers are more likely to be chosen
        for i, di in enumerate(d):
            total -= di
            if total > 0:
                continue
            cluster_centers.append(data_set[i])
            break
    return np.array(cluster_centers)


def kMeans(dataSet, k, distMeas=distEclud, createCent=kpp_centers):
    m, n = dataSet.shape
    centroids = createCent(dataSet.values, k)
    clusterAssment = np.zeros((m, 3))
    clusterAssment[:, 0] = np.inf
    clusterAssment[:, 1:3] = -1
    result_set = pd.concat([dataSet, pd.DataFrame(clusterAssment)], axis=1, ignore_index=True)
    clusterChanged = True
    time = 0
    plot_data(dataSet.iloc[:, 0:n].values, result_set.iloc[:, -1].values, centroids, time)
    while clusterChanged:
        for i in range(m):
            dist = distMeas(dataSet.iloc[i, :n].values, centroids)
            result_set.iloc[i, n] = dist.min()
            result_set.iloc[i, n + 1] = np.argmin(dist)
        clusterChanged = not (result_set.iloc[:, -1] == result_set.iloc[:, -2]).all()
        if clusterChanged:
            cent_df = result_set.groupby(n + 1).mean()
            centroids = cent_df.iloc[:, :n].values
            result_set.iloc[:, -1] = result_set.iloc[:, -2]
            time = time + 1
            plot_data(result_set.iloc[:, 0:n].values, result_set.iloc[:, -1].values, centroids, time)
            if time == 1000:
                break
    return centroids, result_set


X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)
dataSet = pd.DataFrame(X)
centroids, result_set = kMeans(dataSet, 3, distMeas=distEclud, createCent=kpp_centers)
```
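The roulette-wheel loop in kpp_centers samples from the k-means++ seeding distribution: writing $D(x)$ for the distance from sample $x$ to its nearest already-chosen center, the standard formulation picks the next center with probability

$$P(x) = \frac{D(x)^2}{\sum_{x' \in X} D(x')^2}$$

(note that the code above weights by $D(x)$ rather than the squared distance $D(x)^2$, a simplification that keeps the "far points are more likely" behavior).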
The principle behind k-means++ is explained in more detail at: https://blog.csdn.net/kuwola/article/details/124533036
1.5 K-Means Clustering with Scikit-Learn
```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

km = KMeans(n_clusters=3,
            init="random",
            n_init=10,
            max_iter=300,
            tol=1e-4,
            verbose=0,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], s=50, c='lightgreen', marker='s', edgecolors='black', label='Cluster 1')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], s=50, c='orange', marker='o', edgecolors='black', label='Cluster 2')
plt.scatter(X[y_km == 2, 0], X[y_km == 2, 1], s=50, c='lightblue', marker='v', edgecolors='black', label='Cluster 3')
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=250, c='lightblue', marker='*', edgecolors='red',
            label='Centroids')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.savefig('./fig/k means.png')
plt.show()
```
1.6 K-Means++ Clustering with Scikit-Learn
To run the k-means++ algorithm with Scikit-Learn's KMeans object, simply set the init parameter to 'k-means++'.
In fact, 'k-means++' is the default value of init, and it is the recommended choice in practice.
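A minimal sketch, reusing X from the snippet in section 1.5 (only init changes):

```python
from sklearn.cluster import KMeans

km_pp = KMeans(n_clusters=3,
               init='k-means++',  # the only change relative to section 1.5
               n_init=10,
               max_iter=300,
               tol=1e-4,
               random_state=0)
y_km_pp = km_pp.fit_predict(X)
```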
1.7 The Elbow Method with Scikit-Learn
```python
import os
os.environ["OMP_NUM_THREADS"] = "1"  # works around a KMeans memory-leak warning on Windows with MKL
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

# fit k-means for k = 1..10 and record the within-cluster SSE (inertia)
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.savefig('./fig2/a.png', dpi=300)
plt.show()
```
1.8 Silhouette Analysis with Scikit-Learn
```python
import numpy as np
from matplotlib import cm, pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

X, y = make_blobs(n_samples=150,
                  n_features=2,
                  centers=3,
                  cluster_std=0.5,
                  shuffle=True,
                  random_state=0)

km = KMeans(n_clusters=3,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
# silhouette coefficient of every individual sample
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # sorted silhouette values of the samples in cluster c, drawn as one band
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

# red dashed line: the average silhouette coefficient over all samples
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.savefig('./fig/b.png', dpi=300)
plt.show()
```
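As a quick cross-check, sklearn.metrics.silhouette_score computes the same average silhouette directly; it should match the red dashed line in the plot:

```python
from sklearn.metrics import silhouette_score

avg = silhouette_score(X, y_km, metric='euclidean')
print('average silhouette coefficient:', avg)
```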