Data Analysis and Mining (III): Mastering Data Analysis with Library Functions
I. Purpose and Requirements of the Experiment
Master data analysis using library functions
Master the code implementation of the K-means algorithm
Compare the performance of different clustering algorithms
II. Equipment (Environment) and Requirements
Jupyter Notebook, Baidu AI Studio
III. Experiment Content
1. Use library functions to perform K-means clustering analysis on the iris dataset.
(1) Basic implementation steps
Function-call code segment:
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans

np.random.seed(5)              # fix the random seed for reproducibility
iris = datasets.load_iris()    # load the iris dataset
X = iris.data                  # 150 samples x 4 features
y = iris.target                # true species labels (not used for clustering)
est = KMeans(n_clusters=3)     # cluster into 3 groups
est.fit(X)
labels = est.labels_           # cluster label assigned to each sample
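Beyond labels_, the fitted estimator also exposes the learned cluster centres and the within-cluster sum of squared distances, which are handy for a quick sanity check. A minimal inspection sketch, using the est object fitted above:

print(est.cluster_centers_)    # coordinates of the 3 cluster centres (3 x 4 array)
print(est.inertia_)            # within-cluster sum of squared distances
print(np.bincount(labels))     # number of samples assigned to each cluster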
Result-display code segment:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3D projection

fig = plt.figure(1, figsize=(4, 3))
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor='k')
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
ax.set_title('3 clusters')
ax.dist = 12                   # pad the view (deprecated on recent matplotlib; may be removed)
plt.show()
(2) Run (K-means clustering plot)
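Here k is fixed to 3 because iris has three species. When the number of clusters is not known in advance, a common check is to sweep k and inspect the inertia (elbow method) and the silhouette score. A minimal sketch, assuming X from the code above is in scope:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sweep candidate cluster counts and report inertia and silhouette score
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=5).fit(X)
    print("k=%d  inertia=%.2f  silhouette=%.3f"
          % (k, km.inertia_, silhouette_score(X, km.labels_)))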
2. Use library functions to perform DBSCAN clustering analysis on the iris dataset (the demo code below actually clusters synthetic blob data; a sketch for the iris data follows after step (2)).
(1) Basic implementation steps
Function-call code segment:
import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate synthetic sample data around three centers
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)          # standardize features before clustering

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True   # mark core samples
labels = db.labels_

# Number of clusters in labels, ignoring noise (label -1) if present
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))
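DBSCAN's behaviour is driven mainly by eps and min_samples: a smaller eps fragments the data into more clusters and more noise points, a larger one merges clusters. A small illustrative sweep over eps on the same standardized X (values chosen arbitrarily for demonstration):

# Report estimated cluster count and noise ratio for several eps values
for eps in [0.1, 0.2, 0.3, 0.5, 0.8]:
    lab = DBSCAN(eps=eps, min_samples=10).fit(X).labels_
    n_clu = len(set(lab)) - (1 if -1 in lab else 0)
    print("eps=%.1f  clusters=%d  noise=%.1f%%" % (eps, n_clu, 100 * np.mean(lab == -1)))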
Result-display code segment:
import matplotlib.pyplot as plt

unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples are drawn with large markers
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border samples are drawn with small markers
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
(2) Run (DBSCAN clustering plot)
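As noted above, the demo clusters synthetic blobs. To apply DBSCAN to the iris data itself, the same pipeline can be reused; the eps and min_samples values below are illustrative assumptions rather than tuned settings (on iris, DBSCAN often merges versicolor and virginica because they overlap):

import numpy as np
from sklearn import datasets, metrics
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()
X_iris = StandardScaler().fit_transform(iris.data)      # standardize the 4 features

db_iris = DBSCAN(eps=0.8, min_samples=5).fit(X_iris)    # eps/min_samples: illustrative only
labels_iris = db_iris.labels_

n_clusters_iris = len(set(labels_iris)) - (1 if -1 in labels_iris else 0)
print("clusters:", n_clusters_iris, " noise points:", int(np.sum(labels_iris == -1)))
print("ARI vs species labels: %.3f" % metrics.adjusted_rand_score(iris.target, labels_iris))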
3. Implement the K-means algorithm and perform clustering analysis on the iris dataset.
(1) Basic implementation steps
Function-call code segment:
import numpy as np
from sklearn import datasets

def prepare_data():
    iris = datasets.load_iris()
    x = iris.data
    return x

def k_means(x, k=3):
    # Randomly initialize the centroids by picking k samples
    index_list = np.arange(len(x))
    np.random.shuffle(index_list)
    centroids_index = index_list[:k]
    centroids = x[centroids_index]
    # Initialize the label array
    y = np.arange(len(x))
    iter_num = 10  # maximum number of iterations (set as needed)
    # Each iteration: assign every point to its nearest centroid, then recompute the cluster centers
    for _ in range(iter_num):
        y_new = np.arange(len(x))
        for i, xi in enumerate(x):
            y_new[i] = np.argmin([np.linalg.norm(xi - cj) for cj in centroids])
        if sum(y != y_new) == 0:   # stop early once no assignment changes
            break
        for j in range(k):
            centroids[j] = np.mean(x[np.where(y_new == j)], axis=0)
        y = y_new.copy()
    return y
Result-display code segment:
import matplotlib.pyplot as plt

if __name__ == '__main__':
    x = prepare_data()
    y = k_means(x, k=3)            # keep the returned cluster labels
    # Visualization: plot the first two features, coloured by cluster
    for i in range(np.shape(y)[0]):
        if y[i] == 0:
            plt.scatter(x[i][0], x[i][1], c='b', s=20)
        elif y[i] == 1:
            plt.scatter(x[i][0], x[i][1], c='y', s=20)
        else:
            plt.scatter(x[i][0], x[i][1], c='g', s=20)
    plt.show()
(2) Run (plot of iris clustering with the hand-written K-means algorithm)
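Since K-means cluster ids are an arbitrary permutation, a label-invariant score such as the adjusted Rand index (ARI) is a convenient way to check the hand-written k_means against scikit-learn's KMeans and against the true species. A minimal sketch, assuming prepare_data and k_means above are in scope:

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

x = prepare_data()
y_own = k_means(x, k=3)                                        # hand-written K-means
y_lib = KMeans(n_clusters=3, random_state=0).fit_predict(x)   # library K-means
y_true = datasets.load_iris().target                          # true species labels

print("own vs library ARI:     %.3f" % adjusted_rand_score(y_own, y_lib))
print("own vs species ARI:     %.3f" % adjusted_rand_score(y_true, y_own))
print("library vs species ARI: %.3f" % adjusted_rand_score(y_true, y_lib))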
4. Use library functions to perform iris clustering analysis and compare the performance of the K-means and hierarchical clustering algorithms.
(1) Basic implementation steps
Function-call code segment:
import time
import numpy as np
from sklearn import datasets, metrics
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import train_test_split

iter = 10  # number of evaluation runs (name kept from the original code; it shadows the built-in iter)

def prepare_data():
    # For this task both the features and the true species labels are needed
    iris = datasets.load_iris()
    return iris.data, iris.target

def eva_kmeans(x, y):
    # Run `iter` times and average the number of correctly classified samples,
    # the running time and the accuracy.
    # Note: cluster ids are not guaranteed to match the class ids, so this
    # "accuracy" depends on how the arbitrary cluster labels happen to fall.
    kmean_ei = 0.0  # correctly classified samples
    kmean_rt = 0.0  # running time
    kmean_aa = 0.0  # accuracy
    for i in range(0, iter):
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=1)
        k_begin = time.time()
        kmeans = KMeans(init="random", n_clusters=3, random_state=0).fit(x_train)
        kmean_pred = kmeans.predict(x_test)
        k_end = time.time() - k_begin
        kmean_rt = kmean_rt + k_end
        accuracy_number = 0
        for j in range(len(y_test)):
            if kmean_pred[j] == y_test[j]:
                accuracy_number += 1
        kmean_ei = kmean_ei + accuracy_number
        accuracy_percentage = metrics.accuracy_score(y_test, kmean_pred) * 100
        kmean_aa = kmean_aa + accuracy_percentage
    kmean_ei = kmean_ei / (iter * 1.0)
    kmean_rt = kmean_rt / (iter * 1.0)
    kmean_aa = kmean_aa / (iter * 1.0)
    return kmean_ei, kmean_rt, kmean_aa

# Hierarchical (agglomerative) clustering
def eva_hierarchical(x, y):
    hier_ei = 0.0
    hier_rt = 0.0
    hier_aa = 0.0
    for i in range(0, iter):
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=1)
        k_begin = time.time()
        # Euclidean distance is the default metric for average linkage
        hier = AgglomerativeClustering(n_clusters=3, linkage="average")
        hier_pred = hier.fit_predict(x_test)   # agglomerative clustering has no predict(), so fit directly on the test split
        k_end = time.time() - k_begin
        hier_rt = hier_rt + k_end
        accuracy_number = 0
        for j in range(len(y_test)):
            if hier_pred[j] == y_test[j]:
                accuracy_number += 1
        hier_ei = hier_ei + accuracy_number
        accuracy_percentage = metrics.accuracy_score(y_test, hier_pred) * 100
        hier_aa = hier_aa + accuracy_percentage
    hier_ei = hier_ei / (iter * 1.0)
    hier_rt = hier_rt / (iter * 1.0)
    hier_aa = hier_aa / (iter * 1.0)
    return hier_ei, hier_rt, hier_aa
Result-display code segment:
if __name__ == '__main__':
    x, y = prepare_data()
    kmean_ei, kmean_rt, kmean_aa = eva_kmeans(x, y)
    hier_ei, hier_rt, hier_aa = eva_hierarchical(x, y)
    print("total iterate:", iter)
    print("method       ", "avg correctly classified samples ", "avg running time/s ", "avg accuracy/%")
    print("K-means      ", kmean_ei, " ", kmean_rt, " ", kmean_aa)
    print("hierarchical ", hier_ei, " ", hier_rt, " ", hier_aa)
(2) Run (figure: comparison of K-means and hierarchical clustering)
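The accuracy above implicitly assumes that cluster id i coincides with class id i, which neither KMeans nor AgglomerativeClustering guarantees, so a run can score poorly simply because the ids are permuted. As a complementary, label-invariant comparison, the sketch below (not part of the original experiment) scores both algorithms on the full iris data with the adjusted Rand index and the silhouette coefficient and times each fit:

import time
from sklearn import datasets, metrics
from sklearn.cluster import KMeans, AgglomerativeClustering

x, y = datasets.load_iris(return_X_y=True)

for name, model in [("K-means", KMeans(n_clusters=3, random_state=0)),
                    ("hierarchical", AgglomerativeClustering(n_clusters=3, linkage="average"))]:
    t0 = time.time()
    pred = model.fit_predict(x)                    # cluster the full dataset
    rt = time.time() - t0
    ari = metrics.adjusted_rand_score(y, pred)     # agreement with species, label-invariant
    sil = metrics.silhouette_score(x, pred)        # internal cluster quality
    print("%-12s  ARI=%.3f  silhouette=%.3f  time=%.4fs" % (name, ari, sil, rt))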
IV. Analysis of Experimental Results and Problems Encountered
In this session we studied the basic principles of the K-means and hierarchical clustering algorithms and compared them through concrete experiments to understand how the two methods are related and how they differ, using K-means and hierarchical clustering to perform clustering analysis on the iris dataset from the UCI repository. According to the chosen evaluation metrics, K-means achieved slightly higher clustering accuracy, while hierarchical clustering was less accurate but ran faster. When high clustering precision matters, K-means is therefore the better choice; when results are needed quickly, hierarchical clustering can be used, since it ran considerably faster than K-means here. Working through these experiments gave us a much better practical understanding of clustering algorithms, which will be useful in future study and work.