def calulateClassPriorProb(dataset, dataset_info):
    """
    calculate the prior probability of every class
    :param dataset: train data, list type
    :param dataset_info: number of samples per class (category)
    :return: dict mapping each class to its prior probability
    """
    dataset_prior_prob = {}
    sample_sum = len(dataset)
    for class_value, sample_nums in dataset_info.items():
        dataset_prior_prob[class_value] = sample_nums / float(sample_sum)
    return dataset_prior_prob
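# A quick sanity check with toy values (not from the original tutorial): two samples
# of class 0 and one sample of class 1 give priors of 2/3 and 1/3, e.g.
# calulateClassPriorProb([[1.0, 0], [2.0, 0], [3.0, 1]], {0: 2, 1: 1})
# returns {0: 0.666..., 1: 0.333...}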
def mean(number_list):
    number_list = [float(x) for x in number_list]  # str to number
    return sum(number_list) / float(len(number_list))
def var(number_list):
    number_list = [float(x) for x in number_list]
    avg = mean(number_list)
    # population variance: divide the sum of squared deviations by N
    variance = sum([math.pow((x - avg), 2) for x in number_list]) / float(len(number_list))
    return variance
# compute the mean and variance of every attribute
def summarizeAttribute(dataset):
    """
    calculate the mean and variance of every attribute within one class
    :param dataset: train data, list type
    :return: a list of len(attributes) tuples, one (mean, var) per attribute
    """
    dataset = np.delete(dataset, -1, axis=1)  # delete the label column
    # zip(*dataset) regroups the samples by attribute, so the mean and variance
    # can be computed column by column
    summaries = [(mean(attr), var(attr)) for attr in zip(*dataset)]
    return summaries
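# Illustration of the zip(*dataset) trick used above (toy values):
# list(zip(*[[5.1, 3.5], [4.9, 3.0]])) -> [(5.1, 4.9), (3.5, 3.0)]
# i.e. per-sample rows are transposed into per-attribute columns.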
# summarize the data features class by class
def summarizeByClass(dataset_separated):
    """
    calculate the (mean, var) summaries of every attribute for every class
    :param dataset_separated: dict of data lists, one per class
    :return: dict of size len(classes) * len(attributes):
             {class1: [(mean1, var1), (), ...], class2: [(), (), ...], ...}
    """
    summarize_by_class = {}
    for classValue, vector in dataset_separated.items():
        summarize_by_class[classValue] = summarizeAttribute(vector)
    return summarize_by_class  # per-class list of (mean, var) for every attribute
def calculateClassProb(input_data, train_Summary_by_class):
    """
    calculate the class-conditional probability of a sample by multiplying
    the Gaussian probability of every attribute, class by class
    :param input_data: one sample vector (without its label)
    :param train_Summary_by_class: per-class (mean, var) of every attribute
    :return: dict mapping each class to the class-conditional probability of this sample
    """
    prob = {}
    for class_value, summary in train_Summary_by_class.items():
        prob[class_value] = 1
        for i in range(len(summary)):
            attr_mean, attr_var = summary[i]  # Gaussian parameters of attribute i for this class
            x = input_data[i]
            exponent = math.exp(math.pow((x - attr_mean), 2) / (-2 * attr_var))
            p = (1 / math.sqrt(2 * math.pi * attr_var)) * exponent
            prob[class_value] *= p
    return prob
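For reference, the quantity computed in the inner loop above is the Gaussian probability density of attribute value x_i under class c, and the naive independence assumption multiplies these densities over all attributes:

P(x_i \mid c) = \frac{1}{\sqrt{2\pi\sigma_{c,i}^2}} \exp\left(-\frac{(x_i - \mu_{c,i})^2}{2\sigma_{c,i}^2}\right), \qquad P(x \mid c) = \prod_i P(x_i \mid c)

where \mu_{c,i} and \sigma_{c,i}^2 are the per-class mean and variance returned by summarizeByClass.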
4) Compute the posterior probabilities and assign the sample to the class with the largest posterior
Used in the main function below. Since the evidence P(x) is the same for every class, it is enough to compare the prior multiplied by the class-conditional probability and pick the maximum.
# predict on the test set
correct = 0  # number of correctly classified test samples
for vector in testset:
    input_data = vector[:-1]
    label = vector[-1]
    prob = calculateClassProb(input_data, summarize_by_class)
    result = {}
    for class_value, class_prob in prob.items():
        p = class_prob * prior_prob[class_value]
        result[class_value] = p
    predicted = max(result, key=result.get)  # class with the largest posterior
    print(vector)
    print(predicted)
    if predicted == label:
        correct += 1
print("predict correct number:{}, total number:{}, correct ratio:{}".format(correct, len(testset), correct / len(testset)))
5. Manual implementation (complete code)
# import the required libraries
import pandas as pd
import numpy as np
import random
import math
# load the dataset
def loadData(filepath):
    """
    :param filepath: path to a csv file
    :return: list of samples
    """
    data_df = pd.read_csv(filepath)
    data_list = np.array(data_df)  # convert the pandas DataFrame to a NumPy array, then to a list
    data_list = data_list.tolist()
    print("Loaded {0} samples successfully.".format(len(data_list)))
    return data_list
# split the data into a training set and a test set
def splitData(data_list, ratio):
    """
    :param data_list: all data, list type
    :param ratio: proportion of the data used for training
    :return: trainset and testset, both list type
    """
    train_size = int(len(data_list) * ratio)
    random.shuffle(data_list)  # shuffle the list in place
    trainset = data_list[:train_size]
    testset = data_list[train_size:]
    return trainset, testset
# separate the data by class
def seprateByClass(dataset):
    """
    :param dataset: train data, list type
    :return: seprate_dict: data separated by class;
             info_dict: number of samples per class (category)
    """
    seprate_dict = {}
    info_dict = {}
    for vector in dataset:
        if vector[-1] not in seprate_dict:
            seprate_dict[vector[-1]] = []
            info_dict[vector[-1]] = 0
        seprate_dict[vector[-1]].append(vector)
        info_dict[vector[-1]] += 1
    return seprate_dict, info_dict
# compute the prior probabilities
def calulateClassPriorProb(dataset, dataset_info):
    """
    calculate the prior probability of every class
    :param dataset: train data, list type
    :param dataset_info: number of samples per class (category)
    :return: dict mapping each class to its prior probability
    """
    dataset_prior_prob = {}
    sample_sum = len(dataset)
    for class_value, sample_nums in dataset_info.items():
        dataset_prior_prob[class_value] = sample_nums / float(sample_sum)
    return dataset_prior_prob
# function to compute the mean
def mean(number_list):
    number_list = [float(x) for x in number_list]  # str to number
    return sum(number_list) / float(len(number_list))
# function to compute the variance
def var(number_list):
    number_list = [float(x) for x in number_list]
    avg = mean(number_list)
    # population variance: divide the sum of squared deviations by N
    variance = sum([math.pow((x - avg), 2) for x in number_list]) / float(len(number_list))
    return variance
# compute the mean and variance of every attribute
def summarizeAttribute(dataset):
    """
    calculate the mean and variance of every attribute within one class
    :param dataset: train data, list type
    :return: a list of len(attributes) tuples, one (mean, var) per attribute
    """
    dataset = np.delete(dataset, -1, axis=1)  # delete the label column
    # zip(*dataset) regroups the samples by attribute, so the mean and variance
    # can be computed column by column
    summaries = [(mean(attr), var(attr)) for attr in zip(*dataset)]
    return summaries
# summarize the data features class by class
def summarizeByClass(dataset_separated):
    """
    calculate the (mean, var) summaries of every attribute for every class
    :param dataset_separated: dict of data lists, one per class
    :return: dict of size len(classes) * len(attributes):
             {class1: [(mean1, var1), (), ...], class2: [(), (), ...], ...}
    """
    summarize_by_class = {}
    for classValue, vector in dataset_separated.items():
        summarize_by_class[classValue] = summarizeAttribute(vector)
    return summarize_by_class  # per-class list of (mean, var) for every attribute
# compute the class-conditional probabilities
def calculateClassProb(input_data, train_Summary_by_class):
    """
    calculate the class-conditional probability of a sample by multiplying
    the Gaussian probability of every attribute, class by class
    :param input_data: one sample vector (without its label)
    :param train_Summary_by_class: per-class (mean, var) of every attribute
    :return: dict mapping each class to the class-conditional probability of this sample
    """
    prob = {}
    for class_value, summary in train_Summary_by_class.items():
        prob[class_value] = 1
        for i in range(len(summary)):
            attr_mean, attr_var = summary[i]  # Gaussian parameters of attribute i for this class
            x = input_data[i]
            exponent = math.exp(math.pow((x - attr_mean), 2) / (-2 * attr_var))
            p = (1 / math.sqrt(2 * math.pi * attr_var)) * exponent
            prob[class_value] *= p
    return prob
if __name__ == '__main__':
    data_list = loadData('IrisData.csv')
    trainset, testset = splitData(data_list, 0.7)
    dataset_separated, dataset_info = seprateByClass(trainset)
    summarize_by_class = summarizeByClass(dataset_separated)
    prior_prob = calulateClassPriorProb(trainset, dataset_info)
    # predict on the test set
    correct = 0  # number of correctly classified test samples
    for vector in testset:
        input_data = vector[:-1]
        label = vector[-1]
        prob = calculateClassProb(input_data, summarize_by_class)
        result = {}
        for class_value, class_prob in prob.items():
            p = class_prob * prior_prob[class_value]
            result[class_value] = p
        predicted = max(result, key=result.get)  # class with the largest posterior
        print(vector)
        print(predicted)
        if predicted == label:
            correct += 1
    print("predict correct number:{}, total number:{}, correct ratio:{}".format(correct, len(testset), correct / len(testset)))
6. Implementation with the sklearn library
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
# initialize the naive Bayes classifier (Gaussian naive Bayes here)
gnb = GaussianNB()
# fit the classifier on the training set
gnb.fit(X_train, y_train)
# predict on the test set
y_pred = gnb.predict(X_test)
# compute the prediction accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(accuracy))