Slacking off for now... that's all I'm writing at the moment; I'll update it when I have time (meaning I'm too lazy to write the matrix-calculus part right now).
Foreword
I originally planned to structure this as a walkthrough of the paper Learning representations by back-propagating errors, but there is no Chinese version of it (and I'm too lazy to translate the English), so the article is written purely from my own understanding of the BP backpropagation algorithm~ 😅
The paper's title mentions back-propagating errors, i.e. propagating the error backwards:
- First run a forward pass to compute the final output
- Then compute the error between that output and the correct value
- Propagate the error backwards and update the weights, which is how the "learning" is achieved
- As for how exactly the error is propagated backwards and how the weights are updated, that is what this article explores (a quick sketch of the chain rule follows right after this list)
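As a preview, here is a minimal sketch of the chain rule that the code in this article implements, for the single-neuron setup of the first example (pred = Sigmoid(w·x + b), MSE loss L, with the constant factor 2 of the squared error dropped, matching the simplified mse.diff defined later):

$$\frac{\partial L}{\partial w}=\frac{\partial L}{\partial pred}\cdot \frac{\partial pred}{\partial z}\cdot \frac{\partial z}{\partial w}=(pred-true)\cdot pred\,(1-pred)\cdot x,\qquad z=w\,x+b$$

$$w \leftarrow w-lr\cdot \frac{\partial L}{\partial w},\qquad b \leftarrow b-lr\cdot \frac{\partial L}{\partial b},\qquad \frac{\partial L}{\partial b}=(pred-true)\cdot pred\,(1-pred)$$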
python
import random
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
Defining the activation and loss functions
ReLU and Sigmoid will serve as the activation functions of the neural networks later on, and MSE as the loss function. The formulas for all three, together with their derivatives, are listed below.
ReLU
$$ReLU=\max(0,x)$$
$$\frac{\mathrm{d} y}{\mathrm{d} x} = \begin{cases} 1, & x > 0\\ 0, & x \le 0 \end{cases}$$
Sigmoid
$$Sigmoid=\frac{1}{1+e^{-x}}$$
$$\frac{\mathrm{d} y}{\mathrm{d} x} = x\cdot (1-x)$$
(here x denotes the sigmoid's output, not its input; that is why the code later calls sigmoid.diff(pred) on the output)
MSE
$$MSE=\sum (pred-true)^{2}$$
$$\frac{\mathrm{d} y}{\mathrm{d} x} = 2\cdot (pred-true)$$
(the derivative is taken element-wise with respect to each prediction)
Following the formulas above, we can write the activation and loss functions defined below. Note, however, that the mse derivative below is actually computed as (pred − true), without the factor of 2 (and the forward pass uses the mean rather than the sum); dropping a constant factor only rescales the gradient, which can be absorbed into the learning rate.
python
# ReLU activation function
class relu:
    def __call__(self, x):
        return np.maximum(0, x)

    # Derivative of ReLU (expects the ReLU output, which is non-negative)
    def diff(self, x):
        return (x > 0).astype(x.dtype)

# Sigmoid activation function
class sigmoid:
    def __call__(self, x):
        return 1 / (1 + np.exp(-x))

    # Derivative of Sigmoid (expects the sigmoid output)
    def diff(self, x):
        return x * (1 - x)

# MSE loss function
class mse:
    def __call__(self, true, pred):
        return np.mean(np.power(pred - true, 2), keepdims=True)

    # Derivative of MSE (constant factor dropped, see the note above)
    def diff(self, true, pred):
        return pred - true
relu = relu()
sigmoid = sigmoid()
mse = mse()
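As a quick sanity check (this snippet is not part of the original article), note that both diff() methods expect the activation's output rather than its raw input, which is exactly how the training loops below call them (e.g. sigmoid.diff(pred)). A central finite difference confirms the formulas:
python
# Compare the analytic derivatives against a numerical approximation
z = np.array([[-1.5, 0.3, 2.0]])
eps = 1e-6

numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
analytic = sigmoid.diff(sigmoid(z))     # pass the *output* of sigmoid
print(np.allclose(numeric, analytic))   # expected: True

numeric = (relu(z + eps) - relu(z - eps)) / (2 * eps)
analytic = relu.diff(relu(z))           # ReLU's output is non-negative, so this works too
print(np.allclose(numeric, analytic))   # expected: True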
Implementing a simple BP backpropagation
Six ground rules, i.e. the variables involved:
- x: the input value
- w: the weight
- b: the bias
- true: the correct value we want
- lr: the learning rate
- epochs: the number of iterations
python
x = random.random()
w = random.random()
b = random.random()
true = 0.1
lr = 0.3
epochs = 520
# Used to record the loss
loss_history = []
for epoch in range(epochs):
    pred = sigmoid(w * x + b)
    loss = mse(true, pred)
    # Update the parameters (chain rule: dL/dw = dL/dpred * dpred/dz * dz/dw)
    w -= lr * x * sigmoid.diff(pred) * mse.diff(true, pred)
    b -= lr * sigmoid.diff(pred) * mse.diff(true, pred)
    if epoch % 100 == 0:
        print(f'epoch {epoch}, loss={loss}, pred={pred}')
    loss_history.append(loss)
print(f'epoch {epoch+1}, loss={loss}, pred={pred}')
# Plot the loss curve
plt.plot(loss_history)
plt.show()
==============================
Output:
epoch 0, loss=0.4547767889756994, pred=0.7743714028454197
epoch 100, loss=0.008687689715087746, pred=0.19320777711697532
epoch 200, loss=0.0015916314738394428, pred=0.13989525628241337
epoch 300, loss=0.0004858242195607016, pred=0.12204142054316604
epoch 400, loss=0.00017940513211277994, pred=0.11339422010095325
epoch 500, loss=7.284822668391603e-05, pred=0.10853511726245844
epoch 520, loss=6.180920014091377e-05, pred=0.10786188273512864

Implementing a more advanced BP backpropagation
The above was just an appetizer; now we crank up the difficulty and implement it again, this time with matrices (a sketch of the matrix-form gradient follows the output below).
python
x = np.random.rand(1, 2)
w = np.random.rand(2, 3)
b = np.random.rand(1, 3)
true = np.array([[1., 0., 0.]])
lr = 0.1
epochs = 520
loss_history = []
for epoch in range(epochs):
    pred = sigmoid(x @ w + b)
    loss = mse(true, pred)
    # Update the parameters (matrix form of the chain rule; see the sketch after the output)
    w -= lr * x.T @ (sigmoid.diff(pred) * mse.diff(true, pred))
    b -= lr * sigmoid.diff(pred) * mse.diff(true, pred)
    if epoch % 100 == 0:
        print(f'epoch {epoch}, loss={loss}, pred={pred}')
    loss_history.append(loss[0])
print(f'epoch {epoch + 1}, loss={mse(true, pred)}, pred={pred}')
# Plot the loss curve
plt.plot(loss_history)
plt.show()
==============================
Output:
epoch 0, loss=[[0.32000781]], pred=[[0.73586361 0.54378645 0.77107179]]
epoch 100, loss=[[0.08637794]], pred=[[0.81753325 0.27439642 0.38800297]]
epoch 200, loss=[[0.0355867]], pred=[[0.8556965 0.18657386 0.22611236]]
epoch 300, loss=[[0.02130164]], pred=[[0.87807622 0.14650927 0.16605589]]
epoch 400, loss=[[0.01497136]], pred=[[0.89299056 0.12331754 0.13511425]]
epoch 500, loss=[[0.01145747]], pred=[[0.90375678 0.10798084 0.11597327]]
epoch 520, loss=[[0.01096162]], pred=[[0.90547977 0.10562232 0.11311377]]
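Since the matrix-calculus derivation is the part I postponed (see the note at the very top), here is at least a minimal sketch of what the two update lines above compute. With x of shape (1, n), W of shape (n, m), and ⊙ denoting element-wise multiplication, and noting that pred⊙(1−pred) is exactly sigmoid.diff(pred):

$$\delta = pred\odot(1-pred)\odot(pred-true)\quad(1\times m),\qquad \frac{\partial L}{\partial W}=x^{T}\delta\quad(n\times m),\qquad \frac{\partial L}{\partial b}=\delta\quad(1\times m)$$

The error handed back to the layer before this one is $\delta\,W^{T}$ of shape $(1\times n)$, which is exactly what update() returns in the hand-rolled network of the next section.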

A hand-rolled neural network
Now for the main event~ 🤯: implementing automatic differentiation plus matrix-based differentiation.
python
# Define a layer
class Linear:
    def __init__(self, inputs, outputs, activation):
        '''
        inputs: number of input neurons
        outputs: number of output neurons
        activation: activation function
        '''
        # Initialise the weight
        self.weight = np.random.rand(inputs, outputs)
        # Initialise the bias
        self.bias = np.random.rand(1, outputs)
        self.activation = activation
        # Cached values used later for the error backpropagation
        self.x_temp = None
        self.t_temp = None

    # Forward pass of the layer
    def __call__(self, x, parent):
        self.x_temp = x
        self.t_temp = self.activation(x @ self.weight + self.bias)
        parent.layers.append(self)
        return self.t_temp

    # Update weight and bias
    def update(self, grad):
        activation_diff_grad = self.activation.diff(self.t_temp) * grad
        # Propagate the error to the previous layer using the pre-update weights
        grad_prev = activation_diff_grad @ self.weight.T
        self.weight -= lr * self.x_temp.T @ activation_diff_grad
        self.bias -= lr * activation_diff_grad
        return grad_prev
# Define the network
class NetWork:
    def __init__(self):
        # Store the layers for the later parameter updates
        self.layers = []
        # Define each layer
        self.linear_1 = Linear(4, 16, activation=sigmoid)
        self.linear_2 = Linear(16, 8, activation=sigmoid)
        self.linear_3 = Linear(8, 4, activation=sigmoid)

    # Forward pass of the model
    def __call__(self, x):
        # The layers re-register themselves on every forward pass, so clear the list
        # first; otherwise backward() would update each layer once per recorded pass
        self.layers = []
        x = self.linear_1(x, self)
        x = self.linear_2(x, self)
        x = self.linear_3(x, self)
        return x

    # Train the model
    def fit(self, x, y, epochs, step=100):
        for epoch in range(epochs):
            pred = self(x)
            self.backward(y, pred)
            if epoch % step == 0:
                print(f'epoch {epoch}, loss={mse(y, pred)}, pred={pred}')
        print(f'epoch {epoch+1}, loss={mse(y, pred)}, pred={pred}')

    # Backward pass of the model
    def backward(self, true, pred):
        # Derivative of the loss
        grad = mse.diff(true, pred)
        # Update the layers in reverse order; it is BACK-propagation, hence reversed()
        for layer in reversed(self.layers):
            grad = layer.update(grad)
network = NetWork()
lr = 0.2
x = np.array([[1, 2, 3, 4]])
# Normalise the input
x = x/x.sum()
true = np.array([[0.1, 0.9, 0.1, 0.9]])
# Training, launch!!!
network.fit(x, true, 520, 100)
==============================
Output:
epoch 0, loss=[[0.39587845]], pred=[[0.98663574 0.99087787 0.98450152 0.98239628]]
epoch 100, loss=[[0.00421098]], pred=[[0.14174121 0.98933029 0.13505519 0.97676462]]
epoch 200, loss=[[0.00305598]], pred=[[0.10597044 0.98722093 0.10535465 0.9674693 ]]
epoch 300, loss=[[0.00244499]], pred=[[0.10126192 0.98420283 0.10114187 0.95183559]]
epoch 400, loss=[[0.00180562]], pred=[[0.10027395 0.9796371 0.10024925 0.92966927]]
epoch 500, loss=[[0.00133652]], pred=[[0.1000518 0.97228184 0.10004804 0.91101897]]
epoch 520, loss=[[0.00125819]], pred=[[0.10003585 0.97040049 0.10003358 0.90874739]]
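A small aside (this sketch is not from the original article): the same Linear class composes into other architectures too. Below is a hypothetical TinyNet with a ReLU hidden layer, reusing the global lr, the relu/sigmoid/mse instances, and the same x and true as above; the exact loss values will vary from run to run.
python
# A hypothetical two-layer network built from the same Linear class (illustration only)
class TinyNet:
    def __init__(self):
        self.layers = []
        self.hidden = Linear(4, 8, activation=relu)
        self.out = Linear(8, 4, activation=sigmoid)

    def __call__(self, x):
        self.layers = []              # layers re-register on every forward pass
        x = self.hidden(x, self)
        return self.out(x, self)

    def backward(self, true, pred):
        grad = mse.diff(true, pred)
        for layer in reversed(self.layers):
            grad = layer.update(grad)

tiny = TinyNet()
for _ in range(200):                  # same x / true as defined above
    pred = tiny(x)
    tiny.backward(true, pred)
print(mse(true, pred))                # the loss should have shrunk noticeably by now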
Verifying with TensorFlow
Whether it's a mule 🫏 or a horse 🐎, you only find out by taking it for a walk. The verification method:
- Run the computation once with the TensorFlow framework
- Run it once with the hand-rolled neural network
- Compare the derivatives they produce to see whether the calculation is correct
Set up a 3-layer network
python
x = tf.random.uniform((1, 2))
# Layer 1 parameters
w1 = tf.random.uniform((2, 4))
b1 = tf.random.uniform((1, 4))
# Layer 2 parameters
w2 = tf.random.uniform((4, 8))
b2 = tf.random.uniform((1, 8))
# Layer 3 parameters
w3 = tf.random.uniform((8, 2))
b3 = tf.random.uniform((1, 2))
true = tf.constant([[0.5, 0.2]])

with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.nn.relu(x @ w1 + b1)
    y = tf.nn.sigmoid(y @ w2 + b2)
    y = tf.nn.sigmoid(y @ w3 + b3)
    loss = tf.keras.losses.mse(true, y)
print('mse-loss', loss.numpy())

dY_dX = tape.gradient(loss, x)
print('derivative of loss w.r.t. x:', dY_dX.numpy())
==============================
Output:
mse-loss [0.43804157]
derivative of loss w.r.t. x: [[0.00133077 0.00124031]]
Since we only want to compare the computed results, the hand-rolled network is simplified here and only the backward-propagation part is kept. One detail worth noting: tf.keras.losses.mse averages over the two outputs, so its gradient with respect to each prediction is 2(pred − true)/2 = (pred − true), which is exactly what our simplified mse.diff returns; that is why the two derivatives can match exactly rather than only up to a constant factor.
python
class Linear_:
    def __init__(self, weight, bias, activation):
        self.weight = weight
        self.bias = bias
        self.activation = activation
        self.t_temp = None

    def __call__(self, x, parent):
        self.t_temp = self.activation(x @ self.weight + self.bias)
        parent.layers.append(self)
        return self.t_temp

    def update(self, grad):
        activation_diff_grad = self.activation.diff(self.t_temp) * grad
        return activation_diff_grad @ self.weight.T

class NetWork_:
    def __init__(self):
        self.layers = []
        # Reuse the weights and biases from the TensorFlow code above
        # Careful!!! The parameters must be converted to numpy arrays here
        self.linear_1 = Linear_(w1.numpy(), b1.numpy(), activation=relu)
        self.linear_2 = Linear_(w2.numpy(), b2.numpy(), activation=sigmoid)
        self.linear_3 = Linear_(w3.numpy(), b3.numpy(), activation=sigmoid)

    def __call__(self, x):
        # Clear the list so layers are not registered twice across forward passes
        self.layers = []
        x = self.linear_1(x, self)
        x = self.linear_2(x, self)
        x = self.linear_3(x, self)
        return x

    def fit(self, x, y, epochs):
        for epoch in range(epochs):
            pred = self(x)
            self.backward(y, pred)

    def backward(self, true, pred):
        print('mse-loss', mse(true, pred))
        grad = mse.diff(true, pred)
        for layer in reversed(self.layers):
            grad = layer.update(grad)
        # Only the final result of the chain is printed here
        print('derivative of loss w.r.t. x:', grad)
network_ = NetWork_()
# Here comes your doom!!! 👺
# Careful!!! The inputs must be converted to numpy arrays here
network_.fit(x.numpy(), true.numpy(), 1)
==============================
Output:
mse-loss [[0.43804157]]
derivative of loss w.r.t. x: [[0.00133077 0.00124031]]
Quick, look! ☝️☝️☝️ The two results are identical, which shows the hand-rolled implementation is correct.
OK, that wraps up this article 🥳🎉🎊