Introduction to Big Data - Big Data Analytics (by 沐雨先生)

【Lab Objectives】

Learn to perform big data analysis in Python or R, covering classification and clustering tasks. Master Python or R implementations of the kNN, decision-tree, and SVM classifiers and of the k-means clustering algorithm.

【Lab Tasks】

Complete the following big data analysis tasks in Python or R:

1. Classify the iris dataset with kNN, decision-tree, and SVM models.

2. Cluster the iris dataset with the k-means algorithm.

  • Loading the iris dataset in Python
Python:
from sklearn.datasets import load_iris
iris=load_iris()
attributes = iris.data  # feature matrix
# Class labels; already encoded, so 0, 1, 2 stand for the three species
target = iris.target
labels = iris.feature_names  # feature names (not class names)
print(labels)
print(attributes)
print(target)
  • Loading the iris dataset in R
R:
data("iris")
summary(iris)

I chose Python for this experiment.
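Throughout, I hold out a random 30% of the rows as a test set. The kNN and decision-tree sections below hand-roll this split with random.sample; as a minimal sketch, here is the equivalent split with scikit-learn's train_test_split (the form the SVM section also uses):

Python:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape)  # (105, 4) (45, 4)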

1. kNN algorithm

Python:
import random
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
attributes = iris.data
target = iris.target
feature_names = iris.feature_names

# Build rows of [four feature values, class label]
f1 = attributes.tolist()
f2 = target.tolist()
dataset = []
for i in range(len(f1)):
    f1[i].append(f2[i])
    dataset.append(f1[i])

# Hold out a random 30% of rows as the test set; the rest is the training library
n = int(len(dataset) * 0.3)
samples = random.sample(dataset, n)
library = [x for x in dataset if x not in samples]

def createDataSet():
    # Training features (four columns per row)
    group = np.array([row[:-1] for row in library])
    # Class labels aligned with the rows of group
    labels = [row[-1] for row in library]
    return group, labels

def classify0(inX, dataSet, labels, k):
    '''
        :param inX: test sample (array-like, four features)
        :param dataSet: training feature matrix (ndarray)
        :param labels: training class labels (list)
        :param k: number of neighbours (int)
        :return: predicted class
    '''
    # Euclidean distance from inX to every training row
    dataSetSize = dataSet.shape[0]  # number of training samples
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet  # tile(inX, (dataSetSize, 1)) stacks inX vertically dataSetSize times
    sqDiffMat = diffMat ** 2                   # squared differences
    sqDistances = sqDiffMat.sum(axis=1)        # sum over the feature axis
    distances = sqDistances ** 0.5             # square root gives the Euclidean distance
    sortedDistIndicies = distances.argsort()   # indices sorted by increasing distance
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]  # class of the i-th nearest neighbour
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    return max(classCount, key=lambda c: classCount[c])  # majority vote among the k neighbours

if __name__ == '__main__':
    # Build the training set
    group, labels = createDataSet()
    # Classify each held-out sample (strip its label column before predicting)
    for sample in samples:
        test_class = classify0(sample[:-1], group, labels, 3)
        print("Test case:", sample, "predicted class:", test_class)
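As a quick cross-check on the hand-written classifier, here is a minimal sketch using scikit-learn's KNeighborsClassifier with the same k=3, assuming a fresh 70/30 split rather than the exact random.sample draw above:

Python:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=1)

knn = KNeighborsClassifier(n_neighbors=3)  # same k as classify0 above
knn.fit(X_train, y_train)
print("kNN test accuracy:", knn.score(X_test, y_test))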

2. Decision tree algorithm

Python:
# tree.py
import copy
import random
import operator       # used to pick the majority class
from math import log  # used in the entropy computation
from sklearn.datasets import load_iris


# Shannon entropy of a dataset whose class label sits in the last column
def calShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    # Count the occurrences of each class
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        # Entropy: accumulate -p * log2(p) over the classes
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


iris = load_iris()
attributes = iris.data
target = iris.target
labels = iris.feature_names
labels1 = copy.deepcopy(labels)  # untouched copy: creatTree deletes entries from labels

# Build rows of [four feature values, class label]
f1 = attributes.tolist()
f2 = target.tolist()
dataset = []
for i in range(len(f1)):
    f1[i].append(f2[i])
    dataset.append(f1[i])

# Random 30% of rows as test samples; the rest is the training set
library = []
n = int(len(f1) * 0.3)
samples = random.sample(dataset, n)
for x in dataset:
    if x not in samples:
        library.append(x)

# Strip the label column from the held-out test samples
for s in samples:
    del s[4]



# Assemble the training data and the (mutable) feature-name list
def creatDataSet():
    return library, labels


# Split the dataset on one feature value, dropping that feature column
def splitDataSet(dataSet, axis, value):
    '''dataSet: the dataset to split
       axis: index of the splitting feature
       value: the feature value to match'''
    retDataSet = []
    for featVet in dataSet:
        if featVet[axis] == value:
            reducedFeatVec = featVet[:axis]
            reducedFeatVec.extend(featVet[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


# Choose the best split, i.e. the feature with the largest information gain
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calShannonEnt(dataSet)
    bestInfoGain, bestFeature = 0, -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        # Weighted entropy after splitting on feature i
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature




# Return the class that occurs most often in classList
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Recursively build the decision tree as nested dicts
def creatTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # Stop when every remaining sample shares one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when no features remain; fall back to the majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del (labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        sublabels = labels[:]
        myTree[bestFeatLabel][value] = creatTree(
            splitDataSet(dataSet, bestFeat, value), sublabels)
    return myTree

# Walk the tree to classify one test vector; relies on the module-level
# classLabel when a test value matches no branch of the current node
def classify(inputTree, featLabels, testVec):
    global classLabel
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel



if __name__ == '__main__':
    myData, labels = creatDataSet()

    print("Dataset: {}\nFeature names: {}".format(myData, labels))
    print("Shannon entropy of this dataset: {}".format(calShannonEnt(myData)))
    myTree = creatTree(myData, labels)

    print("Decision tree:", myTree)
    for sample in samples:
        f = classify(myTree, labels1, sample)
        print("Test case:", sample, "predicted class:", f)


Sample output from one random split. Because this ID3-style tree treats every distinct measurement as its own categorical value, it fans out very widely on petal length:

{'petal length (cm)': {1.7: 0, 1.4: 0, 1.6: 0, 1.3: 0, 1.5: 0, 1.1: 0, 1.2: 0, 1.0: 0, 1.9: 0, 4.7: 1,
                       4.5:  {'sepal length (cm)': {4.9: 2, 5.6: 1, 6.0: 1, 5.7: 1, 6.4: 1, 6.2: 1, 5.4: 1}},
                       4.9: {'sepal width (cm)': {2.5: 1, 3.0: 2, 3.1: 1, 2.8: 2, 2.7: 2}}, 4.0: 1,
                       5.0: {'sepal length (cm)': {6.3: 2, 5.7: 2, 6.7: 1, 6.0: 2}}, 6.0: 2, 3.5: 1, 3.0: 1, 4.6: 1, 4.4: 1, 4.1: 1,
                       5.1: {'sepal length (cm)': {5.8: 2, 6.9: 2, 6.3: 2, 6.0: 1, 6.5: 2, 5.9: 2}}, 5.9: 2, 5.6: 2, 5.5: 2, 5.4: 2, 6.6: 2, 6.1: 2, 6.9: 2, 6.4: 2, 3.6: 1, 3.3: 1, 3.8: 1, 3.7: 1, 4.2: 1,
                       4.8: {'sepal length (cm)': {6.0: 2, 5.9: 1, 6.8: 1, 6.2: 2}}, 4.3: 1, 5.8: 2, 5.3: 2, 5.7: 2, 5.2: 2, 6.3: 2, 6.7: 2, 3.9: 1}}
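This sprawl is a known limitation of splitting on exact values: a test sample whose petal length never appeared in training matches no branch. As a comparison, a minimal sketch with scikit-learn's DecisionTreeClassifier, which instead splits continuous features on thresholds (assuming the same 70/30 split parameters):

Python:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=1)

tree = DecisionTreeClassifier(criterion="entropy", random_state=1)  # entropy criterion, as above
tree.fit(X_train, y_train)
print(export_text(tree, feature_names=iris.feature_names))  # readable threshold rules
print("Tree test accuracy:", tree.score(X_test, y_test))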

3. SVM algorithm

Python:
from sklearn.datasets import load_iris
from sklearn import svm
import numpy as np
from sklearn import model_selection
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

iris = load_iris()
attributes = iris.data  # feature matrix X
# Class labels; already encoded, so 0, 1, 2 stand for the three species
target = iris.target  # y
labels = iris.feature_names  # feature names
print(labels)
print(attributes)

x = attributes[:, 0:2]  # keep only the first two features so the decision regions can be drawn in 2-D
y = target
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, random_state=1, test_size=0.3)

clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)

acc = clf.predict(x_train) == y_train
print('Training accuracy: %f' % np.mean(acc))
print('SVM test accuracy:', clf.score(x_test, y_test))

# Build a 200x200 grid over the feature ranges to draw the decision regions
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)

print("grid_test = \n", grid_test)
grid_hat = clf.predict(grid_test)
print("grid_hat = \n", grid_hat)
grid_hat = grid_hat.reshape(x1.shape)

cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])  # background decision regions
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])                     # sample points

plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)  # predicted regions
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=cm_dark, edgecolors='k', s=40)  # all samples, coloured by true class
plt.scatter(x_test[:, 0], x_test[:, 1], s=120, facecolors='none', edgecolors='k', zorder=10)  # ring the test samples
plt.xlabel(labels[0])
plt.ylabel(labels[1])
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title("SVM")
plt.show()
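The plot trains on only two features so the regions can be drawn, which caps the achievable accuracy. As a sketch, the same linear SVC on all four features (assuming the split parameters used above):

Python:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import svm

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=1)

clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
print("Train accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))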

4. k-means algorithm

Python:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

iris = load_iris()
attributes = iris.data  # feature matrix X
# Class labels; already encoded, so 0, 1, 2 stand for the three species
target = iris.target  # y
labels = iris.feature_names  # feature names
print(labels)
print(attributes.shape)
print(attributes)
print(target)

plt.style.use('seaborn')  # nicer default styling; the name is 'seaborn-v0_8' on Matplotlib >= 3.6

# Scatter the raw data on the first two features before clustering
x = attributes[:, 0:2]
y = target
plt.scatter(attributes[:, 0], attributes[:, 1], s=50, marker='o', label='samples')
plt.xlabel(labels[0])
plt.ylabel(labels[1])
plt.show()

est = KMeans(n_clusters=3)  # cluster into three groups
est.fit(attributes)
y_kmeans = est.predict(attributes)  # cluster index (0, 1, or 2) for each sample
x0 = attributes[y_kmeans == 0]
x1 = attributes[y_kmeans == 1]
x2 = attributes[y_kmeans == 2]

# Colour the points by predicted cluster and plot
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()

plt.scatter(x0[:, 0], x0[:, 1], s=50, c="red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], s=50, c="green", marker='*', label='label1')
plt.scatter(x2[:, 0], x2[:, 1], s=50, c="blue", marker='+', label='label2')
plt.xlabel(labels[0])
plt.ylabel(labels[1])
centers = est.cluster_centers_  # cluster centres
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)  # mark the centres
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title("kmeans")
plt.legend(loc=2)
plt.show()
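Because k-means is unsupervised, its cluster indices need not line up with the true target codes, so comparing labels position by position is misleading. A minimal sketch scoring the agreement with the adjusted Rand index, which is invariant to label permutations:

Python:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

iris = load_iris()
est = KMeans(n_clusters=3, n_init=10, random_state=1)  # fixed seed for reproducibility
y_kmeans = est.fit_predict(iris.data)
print("Adjusted Rand index:", adjusted_rand_score(iris.target, y_kmeans))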