题目:识别数字0-9,做梯度检验来验证梯度下降过程中是否存在问题,并可视化隐藏层
代码:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from scipy.optimize import minimize
def sigmoid(z):
    """Return the element-wise logistic function 1 / (1 + exp(-z)).

    Args:
        z: Scalar or array-like of pre-activation values.

    Returns:
        The sigmoid of ``z`` with the same shape as ``z``.
    """
    # log(1 + exp(-z)) is evaluated with logaddexp so that large negative z
    # does not overflow inside exp(); the result equals 1 / (1 + exp(-z)).
    return np.exp(-np.logaddexp(0, -z))
def sigmoid_derivation(z):
    """Return the derivative of the sigmoid function evaluated at ``z``.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        The element-wise derivative, same shape as ``z``.
    """
    s = sigmoid(z)  # evaluate the sigmoid once and reuse it
    return s * (1 - s)
def one_hot(raw_y, num_labels=10):
    """Convert 1-based class labels into one-hot encoded rows.

    Label ``k`` sets column ``k - 1`` of its row to 1.

    Args:
        raw_y: Array-like of integer labels in ``1..num_labels``; any shape
            is accepted and flattened (e.g. a column vector of shape (m, 1)).
        num_labels: Number of classes, i.e. the width of each encoded row.

    Returns:
        A float array of shape ``(m, num_labels)``, where ``m`` is the number
        of labels.
    """
    # Cast to a signed integer type so that ``labels - 1`` cannot wrap around
    # when the labels are stored as an unsigned dtype.
    labels = np.asarray(raw_y, dtype=int).ravel()
    encoded = np.zeros((labels.size, num_labels))
    encoded[np.arange(labels.size), labels - 1] = 1
    return encoded
def sequence(theta1, theta2):
    """Serialize two weight matrices into a single 1-D parameter vector.

    scipy's ``minimize`` expects the initial parameters ``x0`` as a flat
    vector, so both matrices are flattened (row-major) and joined, with
    ``theta1`` first.

    Args:
        theta1: First weight matrix.
        theta2: Second weight matrix.

    Returns:
        A new 1-D array holding all entries of ``theta1`` followed by all
        entries of ``theta2``.
    """
    return np.concatenate((np.ravel(theta1), np.ravel(theta2)))
def return_sequence(theta_sequence, input_size=400, hidden_size=25, num_labels=10):
    """Deserialize a flat parameter vector back into the two weight matrices.

    This is the inverse of ``sequence``: it restores the matrix shapes so
    that later matrix products have consistent dimensions.

    Args:
        theta_sequence: 1-D array containing ``theta1`` followed by ``theta2``.
        input_size: Number of input features, excluding the bias unit.
        hidden_size: Number of hidden units, excluding the bias unit.
        num_labels: Number of output units.

    Returns:
        A tuple ``(theta1, theta2)`` with shapes
        ``(hidden_size, input_size + 1)`` and ``(num_labels, hidden_size + 1)``.
        With the defaults these are (25, 401) and (10, 26).
    """
    split = hidden_size * (input_size + 1)
    theta1 = theta_sequence[:split].reshape(hidden_size, input_size + 1)
    theta2 = theta_sequence[split:].reshape(num_labels, hidden_size + 1)
    return theta1, theta2
def forward_propagation(theta_sequence, X):
    """Run a forward pass through the input -> hidden -> output network.

    Args:
        theta_sequence: Flat parameter vector, unpacked with
            ``return_sequence``.
        X: Input matrix with one sample per row. Its column count must match
            ``theta1``'s column count, so the bias column has to be present
            already.

    Returns:
        A tuple ``(a1, z2, a2, z3, h)``:
        ``a1`` is the input, ``z2`` the hidden pre-activations, ``a2`` the
        hidden activations with a leading bias column of ones, ``z3`` the
        output pre-activations and ``h`` the output activations.
    """
    theta1, theta2 = return_sequence(theta_sequence)
    a1 = X
    z2 = a1 @ theta1.T
    # Prepend the bias unit to the hidden activations before the next layer.
    a2 = np.insert(sigmoid(z2), 0, values=1, axis=1)
    z3 = a2 @ theta2.T
    h = sigmoid(z3)
    return a1, z2, a2, z3, h
def cost_function(theta_sequence, X, y):
    """Return the unregularized cross-entropy cost averaged over the samples.

    Args:
        theta_sequence: Flat parameter vector of the network.
        X: Input matrix, one sample per row (bias column included).
        y: One-hot encoded targets with the same shape as the network output.

    Returns:
        The scalar cost ``-sum(y*log(h) + (1-y)*log(1-h)) / m`` where ``h`` is
        the network output and ``m = len(X)``.
    """
    # Only the output activations are needed for the cost.
    *_, h = forward_propagation(theta_sequence, X)
    log_likelihood = y * np.log(h) + (1 - y) * np.log(1 - h)
    return -np.sum(log_likelihood) / len(X)
def reg_cost_function(theta_sequence, X, y, l=1):
    """Return the cross-entropy cost plus an L2 penalty on the weights.

    Args:
        theta_sequence: Flat parameter vector of the network.
        X: Input matrix, one sample per row (bias column included).
        y: One-hot encoded targets.
        l: Regularization strength (lambda).

    Returns:
        ``cost_function(theta_sequence, X, y)`` plus
        ``l / (2 * m) * (sum(theta1[:, 1:]**2) + sum(theta2[:, 1:]**2))``
        with ``m = len(X)``.
    """
    # Unpack the weights from the vector being evaluated. Without this the
    # penalty would be computed from whatever module-level theta1/theta2
    # happen to exist, making it constant during optimization and
    # inconsistent with reg_gradient.
    theta1, theta2 = return_sequence(theta_sequence)
    # The first column of each matrix holds the bias weights, which are not
    # regularized.
    first = np.sum(np.power(theta1[:, 1:], 2))
    second = np.sum(np.power(theta2[:, 1:], 2))
    reg = (first + second) * l / (2 * len(X))
    return reg + cost_function(theta_sequence, X, y)
def gradient(theta_sequence, X, y):
    """Compute the unregularized cost gradient via backpropagation.

    Args:
        theta_sequence: Flat parameter vector of the network.
        X: Input matrix, one sample per row (bias column included).
        y: One-hot encoded targets.

    Returns:
        A flat vector (built with ``sequence``) holding the gradient with
        respect to ``theta1`` followed by the gradient with respect to
        ``theta2``.
    """
    _, theta2 = return_sequence(theta_sequence)
    a1, z2, a2, _, h = forward_propagation(theta_sequence, X)
    m = len(X)
    d3 = h - y  # output-layer error
    # Propagate the error back to the hidden layer; the bias column of theta2
    # is skipped because the bias unit receives no error from below.
    d2 = (d3 @ theta2[:, 1:]) * sigmoid_derivation(z2)
    D2 = (d3.T @ a2) / m
    D1 = (d2.T @ a1) / m
    return sequence(D1, D2)
def reg_gradient(theta_sequence, X, y, l=1):
    """Compute the gradient of the L2-regularized cost.

    Args:
        theta_sequence: Flat parameter vector of the network.
        X: Input matrix, one sample per row (bias column included).
        y: One-hot encoded targets.
        l: Regularization strength (lambda).

    Returns:
        A flat gradient vector in the same layout as ``theta_sequence``.
    """
    D1, D2 = return_sequence(gradient(theta_sequence, X, y))
    theta1, theta2 = return_sequence(theta_sequence)
    m = len(X)
    # Add the penalty's derivative to every column except the first one,
    # which holds the (unregularized) bias weights.
    D1[:, 1:] += theta1[:, 1:] * l / m
    D2[:, 1:] += theta2[:, 1:] * l / m
    return sequence(D1, D2)
def neutral_network(X, y, l):
    """Train the network by minimizing the regularized cost with TNC.

    Args:
        X: Input matrix, one sample per row (bias column included).
        y: One-hot encoded targets.
        l: Regularization strength (lambda) forwarded to the cost and
            gradient functions.

    Returns:
        The result object returned by ``scipy.optimize.minimize``; the
        trained flat parameter vector is available as its ``x`` attribute.
    """
    # Total parameter count of a (25, 401) and a (10, 26) weight matrix,
    # the shapes that return_sequence unpacks by default (= 10285).
    n_params = 25 * 401 + 10 * 26
    # Random initialization breaks symmetry: starting from all zeros would
    # make every hidden unit learn the same feature.
    init_theta = np.random.uniform(-0.5, 0.5, n_params)
    res = minimize(fun=reg_cost_function,
                   x0=init_theta,
                   args=(X, y, l),
                   method='TNC',
                   jac=reg_gradient,
                   options={'maxiter': 300})  # cap the optimizer at 300 iterations
    return res
# Load the data set and split it into features and labels.
data = sio.loadmat('ex4data1.mat')
raw_x, raw_y = data['X'], data['y']
print(raw_y)

# Prepend the bias column to the features and one-hot encode the labels.
X = np.insert(raw_x, 0, 1, axis=1)
print(X.shape)
y = one_hot(raw_y)
print(y)
print(y.shape)

# Load the provided weights and evaluate the regularized cost with them.
theta = sio.loadmat('ex4weights.mat')
theta1, theta2 = theta['Theta1'], theta['Theta2']
print(theta1.shape)
print(theta2.shape)
theta_sequence = sequence(theta1, theta2)
print(reg_cost_function(theta_sequence, X, y, l=1))

# Train from a random start with lambda = 10.
l = 10
res = neutral_network(X, y, l)

# Flatten the labels to 1-D so they can be compared with the predictions.
raw_y = np.reshape(data['y'], 5000)
a1, z2, a2, z3, h = forward_propagation(res.x, X)
# The most activated output unit gives the predicted label (labels are 1-based).
y_pred = h.argmax(axis=1) + 1
accrancy = (y_pred == raw_y).mean()
print(accrancy)
def hidden_layer(theta):
    """Plot the weights of each hidden unit as a 20x20 grayscale image.

    Args:
        theta: Flat parameter vector of the network; only ``theta1`` is used.

    Returns:
        None. A 5x5 grid of images is shown with ``plt.show()``.
    """
    theta1, _ = return_sequence(theta)
    # Drop the bias column so each row holds 400 values (a 20x20 image).
    unit_weights = theta1[:, 1:]
    fig, ax = plt.subplots(nrows=5, ncols=5, figsize=(8, 8), sharex=True, sharey=True)
    # ax.flat walks the grid row by row, matching the row index 5 * r + c.
    for weights, axis in zip(unit_weights, ax.flat):
        axis.imshow(weights.reshape(20, 20).T, cmap='gray_r')
    # The axes are shared, so clearing the ticks once is sufficient.
    plt.xticks([])
    plt.yticks([])
    plt.show()
# Visualize the hidden layer using the parameter vector found by the optimizer.
hidden_layer(res.x)
输出:
[[10]
[10]
[10]
...
[ 9]
[ 9]
[ 9]]
(5000, 401)
[[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
...
[0. 0. 0. ... 0. 1. 0.]
[0. 0. 0. ... 0. 1. 0.]
[0. 0. 0. ... 0. 1. 0.]]
(5000, 10)
(25, 401)
(10, 26)
0.38376985909092365
0.9394
进程已结束,退出代码0
可视化隐藏层
总结:与之前相比,这次代码中的数学运算多了很多,尤其是求偏导的部分;写代码前要先把数学运算的推导过程弄清楚,避免出现差错;有所改进的是,相比之前在minimize中直接加flatten,这次单独添加了函数对参数进行序列化和解序列化操作,方便调用scipy库。