BP Neural Network: Formula Derivation and Code Implementation

Introduction

As a traditional engineering student, I first heard about BP neural networks in a mathematical modeling course in my first year, and later used them all the time in modeling competitions. But I only ever used them by calling third-party packages, so to me neural networks were the black box inside the black box. Since my thesis needs them again, I finally sat down to learn the underlying theory; perhaps thanks to the math I brushed up for the graduate entrance exam, it took only a day or two to roughly figure out how a BP neural network works... OK, back to the topic. The BP neural network (Backpropagation Neural Network) is a classic supervised learning algorithm, widely used for classification, regression, and similar tasks. Its core idea is to use the backpropagation algorithm to iteratively adjust the network parameters (weights and biases) so that the network output gradually approaches the target values. This article starts from a three-layer BP neural network and derives its computational formulas, then generalizes them to a BP network of arbitrary structure, and finally builds a network from scratch with NumPy, using the classic handwritten-digit recognition task as an example.

Formula derivation

The formulas for the forward pass are straightforward. The derivation of the backpropagation formulas only requires the basics of matrix and vector calculus; for that part, see the first article in the Reference section, which explains it very clearly.
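
Since the derivation itself is deferred to that reference, here is a compact summary of the standard results as a reading aid, written in the notation the code below uses (layer 1 is the input, layers 2 through L carry weights, and the per-sample loss is the half sum-of-squares J = ½‖y − a^{(L)}‖²). This summary is mine, not a substitute for the full derivation:

$$a^{(1)} = x,\qquad z^{(l)} = W^{(l)} a^{(l-1)} + b^{(l)},\qquad a^{(l)} = f_l\left(z^{(l)}\right),\quad l = 2,\dots,L$$

$$\delta^{(L)} = -\left(y - a^{(L)}\right)\odot f_L'\left(z^{(L)}\right),\qquad \delta^{(l)} = \left(W^{(l+1)}\right)^{\top}\delta^{(l+1)}\odot f_l'\left(z^{(l)}\right),\quad l = L-1,\dots,2$$

$$\frac{\partial J}{\partial W^{(l)}} = \delta^{(l)}\left(a^{(l-1)}\right)^{\top},\qquad \frac{\partial J}{\partial b^{(l)}} = \delta^{(l)}$$

Here ⊙ denotes the element-wise (Hadamard) product. These equations map one-to-one onto the forward and backward methods in the code.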

Code implementation

The load_mnist file and the MNIST dataset can be downloaded here: pan.baidu.com/s/1cfM92iI5...
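
If the download link is unavailable, note that the script below only assumes a load_mnist() returning a dict of flattened images and one-hot labels. A minimal hypothetical stand-in (the key names and shapes are inferred from how the script uses them; the random placeholders must of course be replaced with data read from the real MNIST files):

python
import numpy as np

def load_mnist():
    """Hypothetical stand-in for MNIST/load_mnist.py: returns flattened
    images and one-hot labels in the format the main script expects."""
    # replace the random placeholders with real data read from the MNIST files
    x_train = np.random.rand(60000, 784)            # each row: a 28x28 image, flattened
    x_test = np.random.rand(10000, 784)
    labels_train = np.random.randint(0, 10, 60000)  # integer class labels 0-9
    labels_test = np.random.randint(0, 10, 10000)
    y_train = np.eye(10)[labels_train]              # one-hot encode to shape (n, 10)
    y_test = np.eye(10)[labels_test]
    return {'x_train': x_train, 'y_train': y_train,
            'x_test': x_test, 'y_test': y_test}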

python
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 14 19:56:49 2025
@author: syaunsheng
"""
import numpy as np
from MNIST.load_mnist import load_mnist
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=['Times New Roman']

class Bpnn():
    def __init__(self,structure,activation=None):
        self.fdict = {'sigmoid':self.sigmoid}
        self.dfdict = {'sigmoid':self.dsigmoid}
        if activation is None:
            activation = {'acfun_'+str(i+1):"sigmoid" for i in range(1,len(structure))} # default: sigmoid as every layer's activation
        assert len(structure)-1 == len(activation),'check the lengths of structure and activation.'
        self.structure = structure
        self.activation = activation
        self.wdict = {'w_'+str(i+1):np.random.randn(self.structure[i],self.structure[i-1]) 
                      for i in range(1,len(self.structure))} # initialize the weight matrices
        self.bdict = {'b_'+str(i+1):np.random.randn(self.structure[i],1)
                      for i in range(1,len(self.structure))} # initialize the bias vectors
        self.zdict = {'z_'+str(i+1):None for i in range(1,len(self.structure))} # pre-activation (state) vectors z
        self.adict = {'a_'+str(i+1):None for i in range(1,len(self.structure))} # activation vectors a
        self.deltadict = {'delta_'+str(i+1):None for i in range(1,len(self.structure))} # error (delta) vectors
        self.pdwdict = {'pdw_'+str(i+1):None for i in range(1,len(self.structure))} # partial derivatives w.r.t. the weights
        self.pdbdict = {'pdb_'+str(i+1):None for i in range(1,len(self.structure))} # partial derivatives w.r.t. the biases
        self.loss = [] # average loss of each batch
        
    def forward(self,x):
        assert x.ndim == 2,"x's dimension must be 2."
        assert x.shape[1] == 1,"x must be a column vector."
        # forward pass: compute the network output
        for i in range(1,len(self.structure)):
            # pre-activation (state) vector of layer i+1
            if i == 1:
                self.zdict['z_'+str(i+1)] = (np.dot(self.wdict['w_'+str(i+1)],
                                                    x)+self.bdict['b_'+str(i+1)]) # a_1 is the input x itself
            else:
                self.zdict['z_'+str(i+1)] = (np.dot(self.wdict['w_'+str(i+1)],
                                                    self.adict['a_'+str(i)])+self.bdict['b_'+str(i+1)])
            # activation vector of layer i+1
            self.adict['a_'+str(i+1)] = self.fdict[self.activation['acfun_'+str(i+1)]](self.zdict['z_'+str(i+1)])
    
    def backward(self,x,y):
        assert x.ndim == 2,"x's dimension must be 2."
        assert x.shape[1] == 1,"x must be a column vector."
        assert y.ndim == 2,"y's dimension must be 2."
        assert y.shape[1] == 1,"y must be a column vector."
        # backward pass: compute the partial derivatives
        for i in range(len(self.structure),1,-1):
            if i == len(self.structure):
                # output layer: delta_L = -(y - a_L) * f'(z_L), from the half sum-of-squares loss
                self.deltadict['delta_'+str(i)] = (-(y-self.fdict[self.activation['acfun_'+str(i)]](self.zdict['z_'+str(i)]))
                                                   *self.dfdict[self.activation['acfun_'+str(i)]](self.zdict['z_'+str(i)]))
            else:
                # hidden layers: delta_l = W_{l+1}^T delta_{l+1} * f'(z_l)
                self.deltadict['delta_'+str(i)] = (np.dot(self.wdict['w_'+str(i+1)].T,self.deltadict['delta_'+str(i+1)])
                                                   *self.dfdict[self.activation['acfun_'+str(i)]](self.zdict['z_'+str(i)]))
            if i >= 3: 
                self.pdwdict['pdw_'+str(i)] = np.dot(self.deltadict['delta_'+str(i)],self.adict['a_'+str(i-1)].T)
            else:
                # for layer 2, the previous activation a_1 is the input x (not stored in adict)
                self.pdwdict['pdw_'+str(i)] = np.dot(self.deltadict['delta_'+str(i)],x.T)
            self.pdbdict['pdb_'+str(i)] = self.deltadict['delta_'+str(i)]
    
    def train(self,x_train,y_train,max_iter=1000,batch_size=32,learning_rate=1e-2):
        log_every = max(max_iter//100,1) # print progress ~100 times; guard against max_iter < 100
        for iter_time in range(max_iter):
            pos = np.random.randint(x_train.shape[0],size=batch_size)
            batch_x,batch_y = x_train[pos,:],y_train[pos,:] # sample the batch for this iteration
            memory = [] # each sample's (zdict,adict,pdwdict,pdbdict)
            memory_loss = [] # each sample's loss
            for x,y in zip(batch_x,batch_y):
                self.forward(x.reshape(-1,1))
                self.backward(x.reshape(-1,1), y.reshape(-1,1))
                # shallow copies are essential here: forward/backward rebind the dict
                # entries on every sample, so appending the dicts themselves would leave
                # memory holding batch_size references to the last sample's gradients only
                memory.append((self.zdict.copy(),self.adict.copy(),
                               self.pdwdict.copy(),self.pdbdict.copy()))
                memory_loss.append(self.mse(self.adict['a_'+str(len(self.structure))], y.reshape(-1,1)))
            self.loss.append(np.mean(memory_loss))
            # gradient-descent parameter update
            for i in range(1,len(self.structure)):
                sum_pdw = np.zeros_like(self.pdwdict['pdw_'+str(i+1)])
                sum_pdb = np.zeros_like(self.pdbdict['pdb_'+str(i+1)])
                # accumulate gradients over the batch (the learning rate absorbs the batch-size scale)
                for _,_,pdw,pdb in memory:
                    sum_pdw += pdw['pdw_'+str(i+1)]
                    sum_pdb += pdb['pdb_'+str(i+1)]
                self.wdict['w_'+str(i+1)] -= learning_rate*sum_pdw
                self.bdict['b_'+str(i+1)] -= learning_rate*sum_pdb  
            # print training progress
            if iter_time % log_every == 0:
                print('[{}/{}]: loss={}'.format(iter_time,max_iter,np.mean(memory_loss)))
                
        
    def predict(self,x):
        assert x.ndim == 2,"x's dimension must be 2."
        assert x.shape[1] == 1,"x must be a column vector."
        self.forward(x)
        res = self.adict['a_'+str(len(self.structure))] # raw network output, converted to one-hot below
        return np.where(res == res[np.argmax(res),0],1,0) 
        
    def accuracy(self,x_test,y_test):
        # evaluation: fraction of test samples predicted correctly
        acc = 0
        for x,y in zip(x_test,y_test):
            if np.all(self.predict(x.reshape(-1,1)) == y.reshape(-1,1)):
                acc += 1
        return acc/len(y_test)
                
                
    def sigmoid(self,z):
        # numerically stable version: avoids overflow in np.exp for large |z|
        z_ravel = z.ravel()  # flatten the array
        y = []
        for item in z_ravel:
            if item >= 0:
                y.append(1.0 / (1 + np.exp(-item)))
            else:
                y.append(np.exp(item) / (np.exp(item) + 1))
        return np.array(y).reshape(z.shape)
    
    def dsigmoid(self,z):
        # derivative of the sigmoid function
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    def mse(self,pred,y):
        """均方损失函数"""
        return np.sum(1/2*(y-pred)**2)

if __name__ == '__main__':
    dataset = load_mnist() 
    # sanity-check the data: each row of x_train should correspond to a flattened 28x28-pixel image
    plt.close('all')
    fig,ax = plt.subplots(3,4)
    x_train,y_train,x_test,y_test = (dataset['x_train'],dataset['y_train'],
                                     dataset['x_test'],dataset['y_test'])
    for i in range(3):
        for j in range(4):
            ax[i,j].imshow(x_train[i*4+j,:].reshape(28,28),cmap='gray') # show the first 12 training images
    # train the model
    model = Bpnn([784,50,20,10])
    model.train(x_train, y_train,max_iter=10000)
    fig,ax = plt.subplots(figsize=(7,4))
    ax.plot(model.loss,color='blue')
    ax.set_xlabel('iteration')
    ax.set_ylabel('loss')
    # accuracy on the test set
    print('Model accuracy: {}'.format(model.accuracy(x_test, y_test)))
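
One usage note on the constructor: the activation argument expects a dict keyed 'acfun_2' through 'acfun_L', one entry per weighted layer. Since only sigmoid is registered in fdict, the explicit dict below is equivalent to the default, but it shows where another activation would plug in:

python
# equivalent to the default; each key names the activation of one weighted layer
model = Bpnn([784,50,20,10],
             activation={'acfun_2':'sigmoid','acfun_3':'sigmoid','acfun_4':'sigmoid'})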

The model reaches an accuracy of about 0.82; the loss curve over training is shown in the figure below.
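
To verify that backward really computes the gradients given by the formulas above, a finite-difference comparison is the standard sanity test. The helper below is my addition, not part of the original code; it perturbs a single weight and compares the analytic and numerical derivatives on a tiny network:

python
def check_gradient(net, x, y, eps=1e-5):
    """Compare backward()'s analytic gradient of w_2[0,0] against a
    central finite difference of the loss (hypothetical test helper)."""
    net.forward(x); net.backward(x, y)
    analytic = net.pdwdict['pdw_2'][0, 0]
    L = len(net.structure)
    net.wdict['w_2'][0, 0] += eps
    net.forward(x); loss_plus = net.mse(net.adict['a_'+str(L)], y)
    net.wdict['w_2'][0, 0] -= 2*eps
    net.forward(x); loss_minus = net.mse(net.adict['a_'+str(L)], y)
    net.wdict['w_2'][0, 0] += eps  # restore the original weight
    numeric = (loss_plus - loss_minus)/(2*eps)
    return analytic, numeric

net = Bpnn([4, 3, 2])
x = np.random.rand(4, 1)
y = np.array([[1.0], [0.0]])
print(check_gradient(net, x, y))  # the two values should agree to several decimal places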

Reference

  1. 科技猛兽, 机器学习中的数学理论1:三步搞定矩阵求导 (Mathematical Theory in Machine Learning 1: Matrix Derivatives in Three Steps)
  2. BP (Back Propagation) 神经网络--原理篇 (BP Neural Network: Principles)