Multi-class classification: recognizes 1 of the 6 characters in a CAPTCHA:
```python
import logging
import os
import h5py
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
IMG_HEIGHT = 50
IMG_WIDTH = 200
IMG_CHANNEL = 1
LABELS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
TRAIN_RECORD_STEP = 50
SUPER_PARAM_LEARNING_RATE = [0.1, 0.01, 0.001, 0.0001]  # exponential scale, random sampling: r = -4 * np.random.rand() in [-4, 0], lr = 10^r
SUPER_PARAM_MINI_BATCH_SIZE = [16, 32, 64]  #
SUPER_PARAM_LAYER1_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER2_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER3_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER_SIZE = [1, 2, 3]  # linear scale, starting from 1
SUPER_PARAM_DROP_OUT = [0.3, 0.5, 0.7]  # linear scale, random sampling
SUPER_PARAM_LAMBD = [0.3, 0.5, 0.7]  # linear scale, random sampling
# Tuning notes:
# number of layers: start from 1
# batch size: start around 128
# dropout: 0.5
# L2 regularization: 1.0
# watch the positive/negative sample ratio
# learning_rate too large: loss explodes or goes to nan; too small: loss barely moves
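# Example of the log-scale sampling mentioned above: lr = 10 ** (-4 * np.random.rand()) draws a value
# from [1e-4, 1]; the __main__ block below uses exactly this to generate candidate learning rates.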
def get_logger():
if not hasattr(get_logger, "logger"):
no_format_logger = logging.getLogger("logger")
no_format_logger.setLevel(logging.DEBUG)
not_format_file_handler = logging.FileHandler("./costs.log")
not_format_file_handler.setFormatter(logging.Formatter('%(message)s'))
no_format_logger.addHandler(not_format_file_handler)
get_logger.logger = no_format_logger
return get_logger.logger
def get_image_data(img_path):
image = Image.open(img_path)
gray_img = image.convert('L')
data = np.array(gray_img)
return data
def get_image_label(img_path):
img_name = img_path.split("/")[-1]
img_code = str(img_name)[0]
return img_code
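# Note: this assumes the image file is named after its CAPTCHA text (for example "ab12cd.png"),
# so the first character of the file name is used as the label.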
def show_one_img(x):
    plt.imshow(x.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
def mv_file(file_paths, target_path):
for file_path in file_paths:
with open(file_path, 'rb') as f:
file_bytes = f.read()
file_path = f.name
file_name = file_path.split("/")[-1]
new_save_file_path = target_path + "/" + file_name
with open(new_save_file_path, 'wb') as f:
f.write(file_bytes)
def split_samples(sample_dir_path):
print("start split samples...")
sample_train_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_train"
sample_verify_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_verify"
sample_test_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_test"
if not os.path.exists(sample_train_path):
os.makedirs(sample_train_path)
if not os.path.exists(sample_verify_path):
os.makedirs(sample_verify_path)
if not os.path.exists(sample_test_path):
os.makedirs(sample_test_path)
img_paths = []
for root, dirs, files in os.walk(sample_dir_path):
for file in files:
if str(file).endswith(".png"):
img_paths.append(os.path.join(root, file))
permutation = np.random.permutation(len(img_paths))
shuffled_img_paths = [img_paths[i] for i in permutation]
img_count = len(img_paths)
train_count = int(img_count * 0.8)
verify_count = int(img_count * 0.19)
train_img_paths = shuffled_img_paths[0:train_count]
    verify_img_paths = shuffled_img_paths[train_count:train_count + verify_count]
    test_img_paths = shuffled_img_paths[train_count + verify_count:]
print(
f"total:{img_count}, split to train:{len(train_img_paths)}, verify:{len(verify_img_paths)}, test:{len(test_img_paths)}")
mv_file(train_img_paths, sample_train_path)
mv_file(verify_img_paths, sample_verify_path)
mv_file(test_img_paths, sample_test_path)
def char_to_label(char):
return LABELS.index(char)
def label_to_char(index):
return LABELS[index]
def save_h5_data(key, data, mode='w'):
with h5py.File("dataset.h5", mode) as f:
f.create_dataset(key, data=data)
def read_h5_data(key):
with h5py.File("dataset.h5", 'r') as f:
return np.array(f[key])
def file_to_data(data_set_path, name_X, name_Y):
img_paths = []
image_count = 0
for root, dirs, files in os.walk(data_set_path):
for file in files:
if file.endswith(".png"):
image_count += 1
img_paths.append(os.path.join(root, file))
    print("found {} images".format(image_count))
X = np.zeros((IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, image_count)).astype(np.int64)
Y = np.zeros((1, image_count)).astype(np.int64)
for simple_index in range(image_count):
# load data
img_path = img_paths[simple_index]
simple_data = get_image_data(img_path)
X[:, simple_index] = simple_data.reshape(IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, -1).ravel()
# load label
simple_label = get_image_label(img_path)
Y[:, simple_index] = char_to_label(simple_label)
save_h5_data(name_X, X, mode='a')
save_h5_data(name_Y, Y, mode='a')
def load_data_set(refresh=False):
if refresh:
if os.path.exists('dataset.h5'):
os.remove('dataset.h5')
file_to_data('./sample_train', 'train_set_x', 'train_set_y')
file_to_data('./sample_verify', 'verify_set_x', 'verify_set_y')
file_to_data('./sample_test', 'test_set_x', 'test_set_y')
# check_data_set('train_set_x', 'train_set_y')
# check_data_set('verify_set_x', 'verify_set_y')
# check_data_set('test_set_x', 'test_set_y')
train_set_x = read_h5_data('train_set_x')
train_set_y = read_h5_data('train_set_y')
verify_set_x = read_h5_data('verify_set_x')
verify_set_y = read_h5_data('verify_set_y')
return train_set_x, train_set_y, verify_set_x, verify_set_y
def check_data_set(name_X, name_Y):
X = read_h5_data(name_X)
Y = read_h5_data(name_Y)
img_index = np.random.randint(0, Y.shape[1])
print(f"im {img_index} is:", label_to_char(Y[:, img_index][0]))
show_one_img(X[:, img_index])
plt.show()
def convert_to_one_hot(Y, C):
Y = np.eye(C)[Y.reshape(-1)].T
return Y
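# For example, convert_to_one_hot(np.array([[2, 0]]), 4) returns a (4, 2) array whose columns are the
# one-hot vectors [0, 0, 1, 0] and [1, 0, 0, 0].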
def initialize_parameters(begin, is_adam_gd, layers_dims): # (12288,20,12,5)
if begin != 0:
h5_name = f"epoch_{begin}.h5"
parameters = read_parameters(h5_name)
return parameters
L = len(layers_dims)
parameters = {}
for l in range(1, L):
if is_adam_gd:
parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.1
else:
parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
parameters[f"b{l}"] = np.zeros((layers_dims[l], 1))
assert (parameters[f"W{l}"].shape == (layers_dims[l], layers_dims[l - 1]))
assert (parameters[f"b{l}"].shape == (layers_dims[l], 1))
return parameters
def initialize_velocity(parameters):
L = len(parameters) // 2
v = {}
for l in range(1, L + 1):
v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
return v
def initialize_adam(parameters):
L = len(parameters) // 2
v = {}
s = {}
for l in range(1, L + 1):
v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
s[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
s[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
return v, s
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
m = X.shape[1]
mini_batches = []
np.random.seed(seed)
    # shuffle the columns
permutation = np.random.permutation(m)
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation].reshape(Y.shape[0], m)
num_mini_batches = int(m / mini_batch_size)
for k in range(num_mini_batches):
        # slice out one mini-batch
mini_batch_X = shuffled_X[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
mini_batch_Y = shuffled_Y[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
mini_batch = (mini_batch_X, mini_batch_Y)
mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
mini_batch_X = shuffled_X[:, num_mini_batches * mini_batch_size:m]
mini_batch_Y = shuffled_Y[:, num_mini_batches * mini_batch_size:m]
mini_batch = (mini_batch_X, mini_batch_Y)
mini_batches.append(mini_batch)
return mini_batches
def relu(Z):
return np.maximum(Z, 0)
def softmax(Z):
    Z = Z - np.max(Z, axis=0, keepdims=True)  # subtract the per-column max for numerical stability
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
return A
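# bn_z below is a simplified batch-norm style transform: it standardizes each row of Z over the current
# mini-batch and then applies a fixed scale (0.9) and shift (0.01) instead of learnable gamma/beta,
# and it keeps no running statistics for use at prediction time.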
def bn_z(Z):
m = Z.shape[1]
U = np.sum(Z, axis=1, keepdims=True) / m
Z = Z - U
E = np.sum(np.square(Z), axis=1, keepdims=True) / m
Z = Z / np.sqrt(E + 1e-8)
r = 0.9
b = 0.01
Z = Z * r + b
return Z
def linear_activation_forward(A_prew, W, b, activation):
Z = np.dot(W, A_prew) + b
Z = bn_z(Z)
    assert Z.shape == (W.shape[0], A_prew.shape[1])
if activation == "relu":
A = relu(Z)
elif activation == "softmax":
A = softmax(Z)
else:
assert (1 != 1), f"there is no support activation:{activation}"
assert (A.shape == (W.shape[0], A_prew.shape[1]))
cache = (A_prew, W, b, Z)
return A, cache
def forward_propagation(X, Y, parameters): # (12288,20,12,5)
caches = []
L = len(parameters) // 2
A = X
for l in range(1, L):
A_prew = A
A, cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
caches.append(cache)
AL, cacheL = linear_activation_forward(A, parameters[f"W{L}"], parameters[f"b{L}"], 'softmax')
assert (AL.shape == Y.shape)
caches.append(cacheL)
return AL, caches
def forward_propagation_with_dropout(X, Y, parameters, keep_prob=0.8): # (12288,20,12,5)
caches = []
L = len(parameters) // 2
A = X
for l in range(1, L):
A_prew = A
A, linear_cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
D = np.random.rand(A.shape[0], A.shape[1])
D = D < keep_prob
A = A * D
A = A / keep_prob
cache = (linear_cache, D)
caches.append(cache)
AL, linear_cacheL = linear_activation_forward(A, parameters[f"W{L}"], parameters[f"b{L}"], 'softmax')
assert (AL.shape == Y.shape)
# last A not do dropout
cacheL = (linear_cacheL, None)
caches.append(cacheL)
return AL, caches
def compute_cost(AL, Y_train):
m = Y_train.shape[1]
logprobs = -np.sum(np.multiply(Y_train, np.log(AL)), axis=0, keepdims=True)
cost = np.sum(logprobs) / m
return cost
def compute_cost_with_regularization(AL, Y, parameters, lambd):
m = Y.shape[1]
logprobs = -np.sum(np.multiply(Y, np.log(AL)), axis=0, keepdims=True)
cost = np.sum(logprobs) / m
sum_W = 0.
for key, value in parameters.items():
if str(key).startswith("W"):
W = parameters[key]
sum_W += np.sum(np.square(W))
L2_regularization_cost = lambd * sum_W / (2 * m)
return cost + L2_regularization_cost
def backward_propagation(AL, Y, caches):
m = Y.shape[1]
grads = {}
L = len(caches) # w1,w2,w3
Y = Y.reshape(AL.shape)
current_cache = caches[-1]
(AL_prew, WL, bL, ZL) = current_cache
    # For the multi-class cross-entropy loss, the combined derivative through the softmax is dL/dz = a - y
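    # Derivation sketch: with a_i = softmax(z)_i and loss = -sum_i y_i * log(a_i),
    # d(loss)/da_i = -y_i / a_i and da_i/dz_j = a_i * (delta_ij - a_j), so
    # d(loss)/dz_j = sum_i (-y_i / a_i) * a_i * (delta_ij - a_j) = a_j * sum_i y_i - y_j = a_j - y_j,
    # since sum_i y_i = 1 for a one-hot label.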
dZL = AL - Y
grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T)
grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
assert (dZL.shape == ZL.shape)
assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
assert (grads[f"dW{L}"].shape == WL.shape)
assert (grads[f"db{L}"].shape == bL.shape)
# w2, w1
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward(grads[f"dA{c}"], cache, 'relu')
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def linear_activation_backward(dA, current_cache, activation):
m = dA.shape[1]
(A_prew, W, b, Z) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def linear_activation_backward_with_dropout(dA, current_cache, activation):
m = dA.shape[1]
((A_prew, W, b, Z), D) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def backward_propagation_with_dropout(AL, Y, caches, keep_prob):
m = Y.shape[1]
grads = {}
L = len(caches) # w1,w2,w3
Y = Y.reshape(AL.shape)
current_cache = caches[-1]
((AL_prew, WL, bL, ZL,), DL) = current_cache
    # Cross-entropy + softmax combined derivative: dL/dz = a - y ; Z = W.A[l-1] + b
dZL = AL - Y
grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T) # dW3
grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # db3
grads[f"dA{L - 1}"] = np.dot(WL.T, dZL) # dA2
# reversed dropout, last dA not do dropout
((_, _, _, _,), D_prew) = caches[-2]
grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] * D_prew # dA2
grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] / keep_prob # dA2
assert (dZL.shape == ZL.shape)
assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
assert (grads[f"dW{L}"].shape == WL.shape)
assert (grads[f"db{L}"].shape == bL.shape)
# L-2, L-1
for c in reversed(range(1, L)):
cache = caches[c - 1] # cache[1]
dA_prew, dW, db = linear_activation_backward_with_dropout(grads[f"dA{c}"], cache, 'relu')
if c > 1:
cache_prew = caches[c - 2]
((_, _, _, _,), D_prew) = cache_prew
# reversed dropout
dA_prew = dA_prew * D_prew
dA_prew = dA_prew / keep_prob
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def backward_propagation_with_regularization(AL, Y, caches, lambd):
m = Y.shape[1]
grads = {}
L = len(caches) # w1,w2,w3
Y = Y.reshape(AL.shape)
current_cache = caches[-1]
(AL_prew, WL, bL, ZL) = current_cache
    # Cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = AL - Y
grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
assert (dZL.shape == ZL.shape)
assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
assert (grads[f"dW{L}"].shape == WL.shape)
assert (grads[f"db{L}"].shape == bL.shape)
# w2, w1
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward_with_regularization(grads[f"dA{c}"], cache, 'relu', lambd)
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def linear_activation_backward_with_regularization(dA, current_cache, activation, lambd):
m = dA.shape[1]
(A_prew, W, b, Z) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def linear_activation_backward_with_regularization_dropout(dA, current_cache, activation, lambd):
m = dA.shape[1]
((A_prew, W, b, Z), D) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def backward_propagation_with_regularization_dropout(AL, Y, caches, lambd, keep_prob):
m = Y.shape[1]
grads = {}
L = len(caches) # w1,w2,w3
Y = Y.reshape(AL.shape)
current_cache = caches[-1]
((AL_prew, WL, bL, ZL), DL) = current_cache
    # Cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = AL - Y
grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
((_, _, _, _,), D_prew) = caches[-2]
grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] * D_prew # dA2
grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] / keep_prob # dA2
assert (dZL.shape == ZL.shape)
assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
assert (grads[f"dW{L}"].shape == WL.shape)
assert (grads[f"db{L}"].shape == bL.shape)
# w2, w1
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward_with_regularization_dropout(grads[f"dA{c}"], cache, 'relu', lambd)
if c > 1:
cache_prew = caches[c - 2]
((_, _, _, _,), D_prew) = cache_prew
# reversed dropout
dA_prew = dA_prew * D_prew
dA_prew = dA_prew / keep_prob
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def update_parameters(grads, parameters, learning_rate):
L = len(parameters) // 2
for l in range(1, L + 1):
w_substract = learning_rate * grads[f"dW{l}"]
parameters[f"W{l}"] = parameters[f"W{l}"] - w_substract
parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]
return parameters
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
L = len(parameters) // 2
for l in range(L):
v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]
parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
return parameters, v
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
L = len(parameters) // 2
v_corrected = {}
s_corrected = {}
for l in range(1, L + 1):
v[f"dW{l}"] = beta1 * v[f"dW{l}"] + (1 - beta1) * grads[f"dW{l}"]
v[f"db{l}"] = beta1 * v[f"db{l}"] + (1 - beta1) * grads[f"db{l}"]
v_corrected[f"dW{l}"] = v[f"dW{l}"] / (1 - np.power(beta1, t))
v_corrected[f"db{l}"] = v[f"db{l}"] / (1 - np.power(beta1, t))
s[f"dW{l}"] = beta2 * s[f"dW{l}"] + (1 - beta2) * np.power(grads[f"dW{l}"], 2)
s[f"db{l}"] = beta2 * s[f"db{l}"] + (1 - beta2) * np.power(grads[f"db{l}"], 2)
s_corrected[f"dW{l}"] = s[f"dW{l}"] / (1 - np.power(beta2, t))
s_corrected[f"db{l}"] = s[f"db{l}"] / (1 - np.power(beta2, t))
parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v_corrected[f"dW{l}"] / np.sqrt(
s_corrected[f"dW{l}"] + epsilon)
parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v_corrected[f"db{l}"] / np.sqrt(
s_corrected[f"db{l}"] + epsilon)
return parameters, v, s
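# Note on the bias correction above: v and s start at zero, so the moving averages are biased toward
# zero early in training; dividing by (1 - beta^t) rescales them, which mainly matters for the first
# few update steps.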
def build_model(X_train, Y_train, layers_dims, learning_rate=0.0001, num_epochs=1000, mini_batch_size=32, beta1=0.9,
                beta2=0.999, epsilon=1e-8, gd="sgd", lambd=0.7, keep_prob=0.8, print_cost=True, print_mini_cost=False,
begin=0):
t = 0
m = X_train.shape[1]
if gd == "sgd":
is_adam_gd = False
print("use gd: sgd")
elif gd == "momentum":
is_adam_gd = False
print("use gd: momentum")
elif gd == "adam":
is_adam_gd = True
print("use gd: adam")
else:
assert (1 != 1), "Not support gd: %s" % gd
parameters = initialize_parameters(begin, is_adam_gd, layers_dims)
if gd == "momentum":
v = initialize_velocity(parameters)
if gd == "adam":
v, s = initialize_adam(parameters)
costs = []
seed = 3
num_mini_batches = int(m / mini_batch_size)
for epoch in range(begin, num_epochs):
seed = seed + 1
mini_batches = random_mini_batches(X_train, Y_train, mini_batch_size, seed)
epoch_cost = 0.
mini_batch_index = 0
for mini_batch in mini_batches:
(mini_batch_X, mini_batch_Y) = mini_batch
if keep_prob == 1:
AL, caches = forward_propagation(mini_batch_X, mini_batch_Y, parameters)
else:
AL, caches = forward_propagation_with_dropout(mini_batch_X, mini_batch_Y, parameters, keep_prob)
if lambd == 0:
mini_batch_cost = compute_cost(AL, mini_batch_Y)
if keep_prob == 1:
grads = backward_propagation(AL, mini_batch_Y, caches)
else:
grads = backward_propagation_with_dropout(AL, mini_batch_Y, caches, keep_prob)
else:
mini_batch_cost = compute_cost_with_regularization(AL, mini_batch_Y, parameters, lambd)
if keep_prob == 1:
grads = backward_propagation_with_regularization(AL, mini_batch_Y, caches, lambd)
else:
grads = backward_propagation_with_regularization_dropout(AL, mini_batch_Y, caches, lambd, keep_prob)
if gd == "sgd":
parameters = update_parameters(grads, parameters, learning_rate)
elif gd == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta=0.9, learning_rate=learning_rate)
else:
t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate,
                                                               beta1, beta2, epsilon)
epoch_cost += mini_batch_cost / num_mini_batches
if print_mini_cost and mini_batch_index % 2 == 0:
print("[{}/{}]Mini batch {} cost: {}".format(epoch, num_epochs, mini_batch_index, mini_batch_cost))
mini_batch_index += 1
if print_cost == True and epoch % TRAIN_RECORD_STEP == 0 and epoch > 0:
get_logger().info(f"Cost after epoch {epoch}: {epoch_cost}")
# accuracy(epoch, parameters)
# save_parameters(h5_name=f"epoch_{epoch}.h5", parameters=parameters)
if print_cost == True and epoch > 0:
costs.append(epoch_cost)
return costs
def save_parameters(h5_name, parameters):
with h5py.File(h5_name, 'w') as f:
for key, value in parameters.items():
f.create_dataset(key, data=value)
def read_parameters(h5_name):
parameters = {}
with h5py.File(h5_name, 'r') as f:
for key in f.keys():
parameters[key] = np.array(f[key])
return parameters
def predict(X, Y, parameters):
AL, _ = forward_propagation(X, Y, parameters)
prediction = np.argmax(AL, axis=0).reshape(1, -1)
return prediction
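# predict returns a (1, m) row of class indices (argmax over the len(LABELS) softmax outputs);
# label_to_char maps an index back to its character.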
def predict_one(parameters_name):
parameters = read_parameters(parameters_name)
np.random.seed(None)
permutation = np.random.permutation(30)
X_random = X_verify[:, permutation]
Y_random = Y_verify[:, permutation]
for index in range(0, 9):
plt.subplot(191 + index)
X_ = np.float32(X_random[:, index].reshape(-1, 1))
Y_ = np.float32(Y_random[:, index].reshape(-1, 1))
        plt.imshow(X_.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
A3, _ = forward_propagation(X_, Y_, parameters)
        print(f"Sample {index} prediction:", label_to_char(np.argmax(A3)))
plt.show()
def show_cost(costs, learning_rate, lamd, keep_prob, layers_dims, num_epochs, mini_batch_size):
plt.plot(costs)
    title = f"learning_rate={learning_rate}"
# title = f"layers_dims={layers_dims}"
# , lamd={lamd}, keep_prob={keep_prob}, layers_dims:{layers_dims}, num_epochs={num_epochs},mini_batch_size={mini_batch_size}"
plt.title(title)
plt.ylabel("cost")
plt.xlabel("iterations (per tens)")
def accuracy(epoch_num, parameters):
train_prediction = predict(X_train, Y_train, parameters)
train_accuracy = np.sum(train_prediction == Y_train_orig) / Y_train_orig.size
    # accuracy on the verification set
verify_prediction = predict(X_verify, Y_verify, parameters)
verify_accuracy = np.sum(verify_prediction == Y_verify_orig) / Y_verify_orig.size
    print(
        f"After {epoch_num} training epochs: training accuracy {round(train_accuracy * 100, 2)}%, verification accuracy {round(verify_accuracy * 100, 2)}%")
    get_logger().info(
        f"After {epoch_num} training epochs: training accuracy {round(train_accuracy * 100, 2)}%, verification accuracy {round(verify_accuracy * 100, 2)}%")
def show_accuracy(num_epochs):
for i in range(0, num_epochs, TRAIN_RECORD_STEP):
if i == num_epochs:
return
parameters_name = f"epoch_{i + TRAIN_RECORD_STEP}.h5"
parameters = read_parameters(parameters_name)
accuracy(i, parameters)
if __name__ == "__main__":
# split_samples("/tmp/samples")
if os.path.exists("./costs.log"):
os.remove("./costs.log")
X_train_orig, Y_train_orig, X_verify_orig, Y_verify_orig = load_data_set(refresh=False)
X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1)
X_verify_flatten = X_verify_orig.reshape(X_verify_orig.shape[0], -1)
    # simple normalization: scale pixel values to [0, 1]
X_train = X_train_flatten / 255
X_verify = X_verify_flatten / 255
Y_train = convert_to_one_hot(Y_train_orig, len(LABELS))
Y_verify = convert_to_one_hot(Y_verify_orig, len(LABELS))
print(X_train.shape)
print(X_verify.shape)
print(Y_train.shape)
print(Y_verify.shape)
begin = 0
num_epochs = 200
# mini_batch_size = 253
keep_prob = 1
lambd = 0
learning_rate = 0.0008
sub_train = 5
mini_batch_size = 600
layers_dims = [IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 72, len(LABELS)]
index = 0
lrs = []
for i in range(4):
lrs.append(np.power(10, -4 * np.random.rand()))
layers_dims_list = [
[IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 1, len(LABELS)],
]
print("gen lr:", lrs)
    for index, lr in enumerate(lrs):
        costs = build_model(X_train, Y_train, layers_dims, num_epochs=num_epochs, gd="adam", keep_prob=keep_prob,
                            lambd=lambd, mini_batch_size=mini_batch_size, print_mini_cost=True, begin=begin,
                            learning_rate=lr)
        plt.subplot(100 + 10 * sub_train + index + 1)
        show_cost(costs, lr, lambd, keep_prob, layers_dims, num_epochs, mini_batch_size)
    plt.show()
```

Multi-task (hard parameter sharing of the hidden layers), recognizing all 6 characters:
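In this version the hidden ReLU layers are shared across all six character positions, while each position gets its own softmax output head (the parameters keyed `W{L}_{label_index}` / `b{L}_{label_index}` below). The cost is the average of the six per-position cross-entropy losses, and during backpropagation the gradients flowing from the six heads into the shared trunk are averaged (`dAL_prew / IMG_LABEL_NUM`) before being propagated further back.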
```python
import logging
import os
import shutil
import h5py
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
IMG_HEIGHT = 50
IMG_WIDTH = 200
IMG_CHANNEL = 1
IMG_LABEL_NUM = 6
LABELS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
TRAIN_RECORD_STEP = 50
SUPER_PARAM_LEARNING_RATE = [0.1, 0.01, 0.001, 0.0001]  # exponential scale, random sampling: r = -4 * np.random.rand() in [-4, 0], lr = 10^r
SUPER_PARAM_MINI_BATCH_SIZE = [16, 32, 64]  #
SUPER_PARAM_LAYER1_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER2_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER3_NUM = [1, 2]  # linear scale, random sampling, starting from 16, 32, 128
SUPER_PARAM_LAYER_SIZE = [1, 2, 3]  # linear scale, starting from 1
SUPER_PARAM_DROP_OUT = [0.3, 0.5, 0.7]  # linear scale, random sampling
SUPER_PARAM_LAMBD = [0.3, 0.5, 0.7]  # linear scale, random sampling
# Tuning notes:
# number of layers: start from 1
# batch size: start around 128
# dropout: 0.5
# L2 regularization: 1.0
# watch the positive/negative sample ratio
# learning_rate too large: loss explodes or goes to nan; too small: loss barely moves
def get_logger():
if not hasattr(get_logger, "logger"):
no_format_logger = logging.getLogger("logger")
no_format_logger.setLevel(logging.DEBUG)
not_format_file_handler = logging.FileHandler("./costs.log")
not_format_file_handler.setFormatter(logging.Formatter('%(message)s'))
no_format_logger.addHandler(not_format_file_handler)
get_logger.logger = no_format_logger
return get_logger.logger
def get_image_data(img_path):
image = Image.open(img_path)
gray_img = image.convert('L')
data = np.array(gray_img)
return data
def get_image_label(img_path):
img_name = img_path.split("/")[-1]
img_codes = {}
for label_index in range(IMG_LABEL_NUM):
img_codes[f"image_code_{label_index}"] = str(img_name)[label_index]
return img_codes
def show_one_img(x):
    plt.imshow(x.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
def mv_file(file_paths, target_path):
for file_path in file_paths:
with open(file_path, 'rb') as f:
file_bytes = f.read()
file_path = f.name
file_name = file_path.split("/")[-1]
new_save_file_path = target_path + "/" + file_name
with open(new_save_file_path, 'wb') as f:
f.write(file_bytes)
def split_samples(sample_dir_path, train_ratio):
print("start split samples...")
sample_train_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_train"
sample_verify_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_verify"
sample_test_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_test"
if not os.path.exists(sample_train_path):
os.makedirs(sample_train_path)
if not os.path.exists(sample_verify_path):
os.makedirs(sample_verify_path)
if not os.path.exists(sample_test_path):
os.makedirs(sample_test_path)
img_paths = []
for root, dirs, files in os.walk(sample_dir_path):
for file in files:
if str(file).endswith(".png"):
img_paths.append(os.path.join(root, file))
permutation = np.random.permutation(len(img_paths))
shuffled_img_paths = [img_paths[i] for i in permutation]
img_count = len(img_paths)
train_count = int(img_count * train_ratio)
verify_count = int(img_count * (1 - train_ratio))
train_img_paths = shuffled_img_paths[0:train_count]
    verify_img_paths = shuffled_img_paths[train_count:train_count + verify_count]
    test_img_paths = shuffled_img_paths[train_count + verify_count:]
print(
f"total:{img_count}, split to train:{len(train_img_paths)}, verify:{len(verify_img_paths)}, test:{len(test_img_paths)}")
mv_file(train_img_paths, sample_train_path)
mv_file(verify_img_paths, sample_verify_path)
mv_file(test_img_paths, sample_test_path)
def char_to_label(char):
return LABELS.index(char)
def label_to_char(index):
return LABELS[index]
def save_h5_data(key, data, mode='w'):
with h5py.File("dataset.h5", mode) as f:
f.create_dataset(key, data=data)
def read_h5_data(key):
with h5py.File("dataset.h5", 'r') as f:
return np.array(f[key])
def file_to_data(data_set_path, name_X, name_Y):
img_paths = []
image_count = 0
for root, dirs, files in os.walk(data_set_path):
for file in files:
if file.endswith(".png"):
image_count += 1
img_paths.append(os.path.join(root, file))
    print("found {} images".format(image_count))
X = np.zeros((IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, image_count)).astype(np.int64)
Y = np.zeros((IMG_LABEL_NUM, image_count)).astype(np.int64)
for simple_index in range(image_count):
# load data
img_path = img_paths[simple_index]
simple_data = get_image_data(img_path)
X[:, simple_index] = simple_data.reshape(IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, -1).ravel()
# load label
img_codes = get_image_label(img_path)
for label_index in range(IMG_LABEL_NUM):
            Y[label_index, simple_index] = char_to_label(img_codes[f"image_code_{label_index}"])
save_h5_data(name_X, X, mode='a')
save_h5_data(name_Y, Y, mode='a')
def load_data_set(refresh=False, train_ratio=0.8, sample_path=''):
if refresh:
if os.path.exists('./sample_train'):
shutil.rmtree("./sample_train")
if os.path.exists('./sample_verify'):
shutil.rmtree("./sample_verify")
if os.path.exists('./sample_test'):
shutil.rmtree("./sample_test")
split_samples(sample_path, train_ratio)
if os.path.exists('dataset.h5'):
os.remove('dataset.h5')
file_to_data('./sample_train', 'train_set_x', 'train_set_y')
file_to_data('./sample_verify', 'verify_set_x', 'verify_set_y')
file_to_data('./sample_test', 'test_set_x', 'test_set_y')
# check_data_set('train_set_x', 'train_set_y')
# check_data_set('verify_set_x', 'verify_set_y')
# check_data_set('test_set_x', 'test_set_y')
train_set_x = read_h5_data('train_set_x')
train_set_y = read_h5_data('train_set_y')
verify_set_x = read_h5_data('verify_set_x')
verify_set_y = read_h5_data('verify_set_y')
return train_set_x, train_set_y, verify_set_x, verify_set_y
def check_data_set(name_X, name_Y):
X = read_h5_data(name_X)
Y = read_h5_data(name_Y)
img_index = np.random.randint(0, Y.shape[1])
text = []
for label_index in range(IMG_LABEL_NUM):
text.append(label_to_char(Y[:, img_index][label_index]))
print(f"im {img_index} is:", text)
show_one_img(X[:, img_index])
plt.show()
def convert_to_one_hot(Y, C):
Y = np.eye(C)[Y.reshape(-1)].T
return Y
def initialize_parameters(begin, layers_dims): # (12288,20,12,5)
if begin != 0:
h5_name = f"epoch_{begin}.h5"
parameters = read_parameters(h5_name)
return parameters
L = len(layers_dims)
parameters = {}
for l in range(1, L):
if l < L - 1:
parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.1
parameters[f"b{l}"] = np.zeros((layers_dims[l], 1))
assert (parameters[f"W{l}"].shape == (layers_dims[l], layers_dims[l - 1]))
assert (parameters[f"b{l}"].shape == (layers_dims[l], 1))
    # last layer: one softmax output head per character position
    for label_index in range(IMG_LABEL_NUM):
        parameters[f"W{L - 1}_{label_index}"] = np.random.randn(layers_dims[L - 1], layers_dims[L - 2]) * 0.1
        parameters[f"b{L - 1}_{label_index}"] = np.zeros((layers_dims[L - 1], 1))
        assert (parameters[f"W{L - 1}_{label_index}"].shape == (layers_dims[L - 1], layers_dims[L - 2]))
        assert (parameters[f"b{L - 1}_{label_index}"].shape == (layers_dims[L - 1], 1))
return parameters
def initialize_velocity(parameters):
L = len(parameters) // 2
v = {}
for l in range(1, L + 1):
v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
return v
def initialize_adam(parameters):
L = len(parameters) // 2 - IMG_LABEL_NUM
v = {}
s = {}
for l in range(1, L + 1):
v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
s[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
s[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
for label_index in range(IMG_LABEL_NUM):
v[f"dW{L + 1}_{label_index}"] = np.zeros_like(parameters[f"W{L + 1}_{label_index}"])
v[f"db{L + 1}_{label_index}"] = np.zeros_like(parameters[f"b{L + 1}_{label_index}"])
s[f"dW{L + 1}_{label_index}"] = np.zeros_like(parameters[f"W{L + 1}_{label_index}"])
s[f"db{L + 1}_{label_index}"] = np.zeros_like(parameters[f"b{L + 1}_{label_index}"])
return v, s
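# Key layout used by initialize_adam and update_parameters_with_adam: the shared hidden layers are keyed
# dW{l}/db{l} (l = 1..number of shared layers), and the six output heads are keyed with the next layer
# index plus a per-position suffix, e.g. dW3_0 ... dW3_5 for layers_dims = [input, 120, 72, 36].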
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
m = X.shape[1]
mini_batches = []
np.random.seed(seed)
    # shuffle the columns
permutation = np.random.permutation(m)
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, :, permutation]
num_mini_batches = int(m / mini_batch_size)
for k in range(num_mini_batches):
        # slice out one mini-batch
mini_batch_X = shuffled_X[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
mini_batch_Y = shuffled_Y[:, :, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
mini_batch = (mini_batch_X, mini_batch_Y)
mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
mini_batch_X = shuffled_X[:, num_mini_batches * mini_batch_size:m]
mini_batch_Y = shuffled_Y[:, :, num_mini_batches * mini_batch_size:m]
mini_batch = (mini_batch_X, mini_batch_Y)
mini_batches.append(mini_batch)
return mini_batches
def relu(Z):
return np.maximum(Z, 0)
def softmax(Z):
    Z = Z - np.max(Z, axis=0, keepdims=True)  # subtract the per-column max for numerical stability
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
return A
def bn_z(Z):
m = Z.shape[1]
U = np.sum(Z, axis=1, keepdims=True) / m
Z = Z - U
E = np.sum(np.square(Z), axis=1, keepdims=True) / m
Z = Z / np.sqrt(E + 1e-8)
r = 0.9
b = 0.01
Z = Z * r + b
return Z
def linear_activation_forward(A_prew, W, b, activation):
Z = np.dot(W, A_prew) + b
# Z = bn_z(Z)
    assert Z.shape == (W.shape[0], A_prew.shape[1])
if activation == "relu":
A = relu(Z)
elif activation == "softmax":
A = softmax(Z)
else:
assert (1 != 1), f"there is no support activation:{activation}"
assert (A.shape == (W.shape[0], A_prew.shape[1]))
cache = (A_prew, W, b, Z)
return A, cache
def forward_propagation(X, Y, parameters): # (12288,20,12,5)
caches = []
L = len(parameters) // 2 - IMG_LABEL_NUM + 1
A = X
for l in range(1, L):
A_prew = A
A, cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
caches.append(cache)
ALS = {}
CACHELS = {}
for label_index in range(IMG_LABEL_NUM):
AL, cacheL = linear_activation_forward(A, parameters[f"W{L}_{label_index}"], parameters[f"b{L}_{label_index}"],
'softmax')
assert (AL.shape == (Y.shape[1], Y.shape[2]))
ALS[f"AL_{label_index}"] = AL
CACHELS[f"CACHEL_{label_index}"] = cacheL
return caches, ALS, CACHELS
def forward_propagation_with_dropout(X, Y, parameters, keep_prob=0.8): # (12288,20,12,5)
caches = []
L = len(parameters) // 2 - IMG_LABEL_NUM + 1
A = X
for l in range(1, L):
A_prew = A
A, linear_cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
D = np.random.rand(A.shape[0], A.shape[1])
D = D < keep_prob
A = A * D
A = A / keep_prob
cache = (linear_cache, D)
caches.append(cache)
ALS = {}
CACHELS = {}
for label_index in range(IMG_LABEL_NUM):
AL, cacheL = linear_activation_forward(A, parameters[f"W{L}_{label_index}"], parameters[f"b{L}_{label_index}"],
'softmax')
assert (AL.shape == (Y.shape[1], Y.shape[2]))
ALS[f"AL_{label_index}"] = AL
CACHELS[f"CACHEL_{label_index}"] = cacheL
return caches, ALS, CACHELS
def compute_cost(ALS, Y):
m = Y.shape[2]
cost = 0.
for label_index in range(IMG_LABEL_NUM):
logprobs = -np.sum(np.multiply(Y[label_index], np.log(ALS[f"AL_{label_index}"])), axis=0, keepdims=True)
cost += np.sum(logprobs) / m
return cost / IMG_LABEL_NUM
def compute_cost_with_regularization(ALS, Y, parameters, lambd):
cost = compute_cost(ALS, Y)
m = Y.shape[2]
sum_WL = None
for key, value in parameters.items():
if str(key).startswith("W") and str(key).count("_") > 0:
sum_WL = np.zeros_like(parameters[key])
break
for key, value in parameters.items():
if str(key).startswith("W") and str(key).count("_") > 0:
W = parameters[key]
sum_WL += W
sum_WL = sum_WL / IMG_LABEL_NUM
sum_W = 0.
for key, value in parameters.items():
if str(key).startswith("W") and str(key).count("_") <= 0:
W = parameters[key]
sum_W += np.sum(np.square(W))
sum_W += np.sum(np.square(sum_WL))
L2_regularization_cost = lambd * sum_W / (2 * m)
return cost + L2_regularization_cost
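# Note: the L2 term above penalizes the squared values of the element-wise average of the six head
# weight matrices (sum_WL / IMG_LABEL_NUM) rather than summing each head's squared norm separately;
# the shared-layer weights are penalized with the usual sum of squares.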
def backward_propagation(ALS, Y, caches, CACHELS):
m = Y.shape[2]
grads = {}
L = len(caches) + 1 # w1,w2,w3
dAL_prew = None
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
dAL_prew = np.zeros_like(AL_prew)
break
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
        # cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T)
grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
dAL_prew += np.dot(WL.T, dZL)
assert (dAL_prew.shape == AL_prew.shape)
assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
# w2, w1
    # average dA over the six heads
dAL_prew = dAL_prew / IMG_LABEL_NUM
grads[f"dA{L - 1}"] = dAL_prew
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward(grads[f"dA{c}"], cache, 'relu')
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def linear_activation_backward(dA, current_cache, activation):
m = dA.shape[1]
(A_prew, W, b, Z) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def linear_activation_backward_with_dropout(dA, current_cache, activation):
m = dA.shape[1]
((A_prew, W, b, Z), D) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def backward_propagation_with_dropout(ALS, Y, caches, CACHELS, keep_prob):
m = Y.shape[2]
grads = {}
L = len(caches) + 1 # w1,w2,w3
dAL_prew = None
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
dAL_prew = np.zeros_like(AL_prew)
break
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
        # cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T)
grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
dAL_prew += np.dot(WL.T, dZL)
assert (dAL_prew.shape == AL_prew.shape)
assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
dAL_prew = dAL_prew / IMG_LABEL_NUM
grads[f"dA{L - 1}"] = dAL_prew
# L-2, L-1
for c in reversed(range(1, L)):
cache = caches[c - 1] # cache[1]
dA_prew, dW, db = linear_activation_backward_with_dropout(grads[f"dA{c}"], cache, 'relu')
if c > 1:
cache_prew = caches[c - 2]
((_, _, _, _,), D_prew) = cache_prew
# reversed dropout
dA_prew = dA_prew * D_prew
dA_prew = dA_prew / keep_prob
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def backward_propagation_with_regularization(ALS, Y, caches, CACHELS, lambd):
m = Y.shape[2]
grads = {}
L = len(caches) + 1 # w1,w2,w3
dAL_prew = None
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
dAL_prew = np.zeros_like(AL_prew)
break
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
        # cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
dAL_prew += np.dot(WL.T, dZL)
assert (dAL_prew.shape == AL_prew.shape)
assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
# w2, w1
    # average dA over the six heads
dAL_prew = dAL_prew / IMG_LABEL_NUM
grads[f"dA{L - 1}"] = dAL_prew
# w2, w1
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward_with_regularization(grads[f"dA{c}"], cache, 'relu', lambd)
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def linear_activation_backward_with_regularization(dA, current_cache, activation, lambd):
m = dA.shape[1]
(A_prew, W, b, Z) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def linear_activation_backward_with_regularization_dropout(dA, current_cache, activation, lambd):
m = dA.shape[1]
((A_prew, W, b, Z), D) = current_cache
if activation == 'relu':
dZ = 1 * dA
dZ[Z <= 0] = 0
else:
assert (1 != 1), f"not support backward activation:{activation}"
# Z[L] = W[L].A[L-1] + b[L]
dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prew = np.dot(W.T, dZ)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
assert (dA_prew.shape == A_prew.shape)
return dA_prew, dW, db
def backward_propagation_with_regularization_dropout(ALS, Y, caches, CACHELS, lambd, keep_prob):
m = Y.shape[2]
grads = {}
L = len(caches) + 1 # w1,w2,w3
dAL_prew = None
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
dAL_prew = np.zeros_like(AL_prew)
break
for label_index in range(IMG_LABEL_NUM):
cacheL = CACHELS[f"CACHEL_{label_index}"]
(AL_prew, WL, bL, ZL) = cacheL
        # cross-entropy + softmax combined derivative: dL/dz = a - y
dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True) # Z = W.A[l-1] + b
dAL_prew += np.dot(WL.T, dZL)
assert (dAL_prew.shape == AL_prew.shape)
assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
# w2, w1
    # average dA over the six heads
dAL_prew = dAL_prew / IMG_LABEL_NUM
grads[f"dA{L - 1}"] = dAL_prew
# w2, w1
for c in reversed(range(1, L)):
cache = caches[c - 1]
dA_prew, dW, db = linear_activation_backward_with_regularization_dropout(grads[f"dA{c}"], cache, 'relu', lambd)
if c > 1:
cache_prew = caches[c - 2]
((_, _, _, _,), D_prew) = cache_prew
# reversed dropout
dA_prew = dA_prew * D_prew
dA_prew = dA_prew / keep_prob
grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
return grads
def update_parameters(grads, parameters, learning_rate):
L = len(parameters) // 2
for l in range(1, L + 1):
w_substract = learning_rate * grads[f"dW{l}"]
parameters[f"W{l}"] = parameters[f"W{l}"] - w_substract
parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]
return parameters
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
L = len(parameters) // 2
for l in range(L):
v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]
parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
return parameters, v
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
L = len(parameters) // 2 - IMG_LABEL_NUM + 1
v_corrected = {}
s_corrected = {}
for l in range(1, L):
v[f"dW{l}"] = beta1 * v[f"dW{l}"] + (1 - beta1) * grads[f"dW{l}"]
v[f"db{l}"] = beta1 * v[f"db{l}"] + (1 - beta1) * grads[f"db{l}"]
v_corrected[f"dW{l}"] = v[f"dW{l}"] / (1 - np.power(beta1, t))
v_corrected[f"db{l}"] = v[f"db{l}"] / (1 - np.power(beta1, t))
s[f"dW{l}"] = beta2 * s[f"dW{l}"] + (1 - beta2) * np.power(grads[f"dW{l}"], 2)
s[f"db{l}"] = beta2 * s[f"db{l}"] + (1 - beta2) * np.power(grads[f"db{l}"], 2)
s_corrected[f"dW{l}"] = s[f"dW{l}"] / (1 - np.power(beta2, t))
s_corrected[f"db{l}"] = s[f"db{l}"] / (1 - np.power(beta2, t))
parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v_corrected[f"dW{l}"] / np.sqrt(
s_corrected[f"dW{l}"] + epsilon)
parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v_corrected[f"db{l}"] / np.sqrt(
s_corrected[f"db{l}"] + epsilon)
for label_index in range(IMG_LABEL_NUM):
v[f"dW{L}_{label_index}"] = beta1 * v[f"dW{L}_{label_index}"] + (1 - beta1) * grads[f"dW{L}_{label_index}"]
v[f"db{L}_{label_index}"] = beta1 * v[f"db{L}_{label_index}"] + (1 - beta1) * grads[f"db{L}_{label_index}"]
v_corrected[f"dW{L}_{label_index}"] = v[f"dW{L}_{label_index}"] / (1 - np.power(beta1, t))
v_corrected[f"db{L}_{label_index}"] = v[f"db{L}_{label_index}"] / (1 - np.power(beta1, t))
s[f"dW{L}_{label_index}"] = beta2 * s[f"dW{L}_{label_index}"] + (1 - beta2) * np.power(
grads[f"dW{L}_{label_index}"], 2)
s[f"db{L}_{label_index}"] = beta2 * s[f"db{L}_{label_index}"] + (1 - beta2) * np.power(
grads[f"db{L}_{label_index}"], 2)
s_corrected[f"dW{L}_{label_index}"] = s[f"dW{L}_{label_index}"] / (1 - np.power(beta2, t))
s_corrected[f"db{L}_{label_index}"] = s[f"db{L}_{label_index}"] / (1 - np.power(beta2, t))
parameters[f"W{L}_{label_index}"] = parameters[f"W{L}_{label_index}"] - learning_rate * v_corrected[
f"dW{L}_{label_index}"] / np.sqrt(
s_corrected[f"dW{L}_{label_index}"] + epsilon)
parameters[f"b{L}_{label_index}"] = parameters[f"b{L}_{label_index}"] - learning_rate * v_corrected[
f"db{L}_{label_index}"] / np.sqrt(
s_corrected[f"db{L}_{label_index}"] + epsilon)
return parameters, v, s
def build_model(X_train, Y_train, layers_dims, learning_rate=0.0001, num_epochs=1000, mini_batch_size=32, beta1=0.9,
                beta2=0.999, epsilon=1e-8, gd="sgd", lambd=0.7, keep_prob=0.8, print_cost=True, print_mini_cost=False,
begin=0):
t = 0
m = X_train.shape[1]
costs = []
seed = 88
parameters = initialize_parameters(begin, layers_dims)
v, s = initialize_adam(parameters)
num_mini_batches = int(m / mini_batch_size)
for epoch in range(begin, num_epochs):
seed = seed + 1
mini_batches = random_mini_batches(X_train, Y_train, mini_batch_size, seed)
epoch_cost = 0.
mini_batch_index = 0
for mini_batch in mini_batches:
(mini_batch_X, mini_batch_Y) = mini_batch
if keep_prob == 1:
caches, ALS, CACHELS = forward_propagation(mini_batch_X, mini_batch_Y, parameters)
else:
caches, ALS, CACHELS = forward_propagation_with_dropout(mini_batch_X, mini_batch_Y, parameters,
keep_prob)
if lambd == 0:
mini_batch_cost = compute_cost(ALS, mini_batch_Y)
if keep_prob == 1:
grads = backward_propagation(ALS, mini_batch_Y, caches, CACHELS)
else:
grads = backward_propagation_with_dropout(ALS, mini_batch_Y, caches, CACHELS, keep_prob)
else:
mini_batch_cost = compute_cost_with_regularization(ALS, mini_batch_Y, parameters, lambd)
if keep_prob == 1:
grads = backward_propagation_with_regularization(ALS, mini_batch_Y, caches, CACHELS, lambd)
else:
grads = backward_propagation_with_regularization_dropout(ALS, mini_batch_Y, caches, CACHELS, lambd,
keep_prob)
t = t + 1
            parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate,
                                                           beta1, beta2, epsilon)
epoch_cost += mini_batch_cost / num_mini_batches
if print_mini_cost and mini_batch_index % 2 == 0:
print("[{}/{}]Mini batch {} cost: {}".format(epoch, num_epochs, mini_batch_index, mini_batch_cost))
mini_batch_index += 1
if print_cost == True and epoch % TRAIN_RECORD_STEP == 0 and epoch > 0:
get_logger().info(f"Cost after epoch {epoch}: {epoch_cost}")
# accuracy(epoch, parameters)
save_parameters(h5_name=f"epoch_{epoch}.h5", parameters=parameters)
if print_cost == True and epoch > 0:
costs.append(epoch_cost)
return costs, parameters
def save_parameters(h5_name, parameters):
with h5py.File(h5_name, 'w') as f:
for key, value in parameters.items():
f.create_dataset(key, data=value)
def read_parameters(h5_name):
parameters = {}
with h5py.File(h5_name, 'r') as f:
for key in f.keys():
parameters[key] = np.array(f[key])
return parameters
def predict(X, Y, parameters):
_, ALS, _ = forward_propagation(X, Y, parameters)
predictions = np.zeros((IMG_LABEL_NUM, Y.shape[2]))
for label_index in range(IMG_LABEL_NUM):
AL = ALS[f"AL_{label_index}"]
prediction = np.argmax(AL, axis=0).reshape(1, -1)
predictions[label_index] = prediction
return predictions
def predict_one(parameters_name):
parameters = read_parameters(parameters_name)
np.random.seed(None)
permutation = np.random.permutation(30)
    X_random = X_verify[:, permutation]
    Y_random = Y_verify[:, :, permutation]
    for index in range(0, 9):
        plt.subplot(191 + index)
        X_ = np.float32(X_random[:, index].reshape(-1, 1))
        Y_ = np.float32(Y_random[:, :, index].reshape(IMG_LABEL_NUM, len(LABELS), 1))
        plt.imshow(X_.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
        _, ALS, _ = forward_propagation(X_, Y_, parameters)
        text = "".join(label_to_char(int(np.argmax(ALS[f"AL_{i}"]))) for i in range(IMG_LABEL_NUM))
        print(f"Sample {index} prediction:", text)
plt.show()
def show_cost(costs, learning_rate, lamd, keep_prob, layers_dims, num_epochs, mini_batch_size):
plt.plot(costs)
    title = f"learning_rate={learning_rate}"
# title = f"layers_dims={layers_dims}"
# , lamd={lamd}, keep_prob={keep_prob}, layers_dims:{layers_dims}, num_epochs={num_epochs},mini_batch_size={mini_batch_size}"
plt.title(title)
plt.ylabel("cost")
plt.xlabel("iterations (per tens)")
def accuracy(epoch_num, parameters, X_train, Y_train, Y_train_orig, X_verify, Y_verify, Y_verify_orig):
    # the data sets are passed in explicitly because they are local to start() below
    train_predictions = predict(X_train, Y_train, parameters)
    train_accuracy = np.sum(train_predictions == Y_train_orig) / Y_train_orig.size
    # accuracy on the verification set
    verify_predictions = predict(X_verify, Y_verify, parameters)
    verify_accuracy = np.sum(verify_predictions == Y_verify_orig) / Y_verify_orig.size
    print(
        f"After {epoch_num} training epochs: training accuracy {round(train_accuracy * 100, 2)}%, verification accuracy {round(verify_accuracy * 100, 2)}%")
    get_logger().info(
        f"After {epoch_num} training epochs: training accuracy {round(train_accuracy * 100, 2)}%, verification accuracy {round(verify_accuracy * 100, 2)}%")
def show_accuracy(num_epochs, X_train, Y_train, Y_train_orig, X_verify, Y_verify, Y_verify_orig):
    for i in range(0, num_epochs, TRAIN_RECORD_STEP):
        if i + TRAIN_RECORD_STEP >= num_epochs:
            return
        parameters_name = f"epoch_{i + TRAIN_RECORD_STEP}.h5"
        parameters = read_parameters(parameters_name)
        accuracy(i, parameters, X_train, Y_train, Y_train_orig, X_verify, Y_verify, Y_verify_orig)
def start(begin, num_epochs, drop_out, lambd, learning_rate, mini_batch_size, layers_dims, train_ratio, reload_data,
sample_path):
keep_prob = 1 - drop_out
if os.path.exists("./costs.log"):
os.remove("./costs.log")
X_train_orig, Y_train_orig, X_verify_orig, Y_verify_orig = load_data_set(refresh=reload_data,
train_ratio=train_ratio,
sample_path=sample_path)
X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1)
X_verify_flatten = X_verify_orig.reshape(X_verify_orig.shape[0], -1)
    # simple normalization: scale pixel values to [0, 1]
X_train = X_train_flatten / 255
X_verify = X_verify_flatten / 255
Y_train = np.zeros((IMG_LABEL_NUM, len(LABELS), X_train.shape[1]))
Y_verify = np.zeros((IMG_LABEL_NUM, len(LABELS), X_verify.shape[1]))
for label_index in range(IMG_LABEL_NUM):
Y_train[label_index] = convert_to_one_hot(Y_train_orig[label_index], len(LABELS))
Y_verify[label_index] = convert_to_one_hot(Y_verify_orig[label_index], len(LABELS))
costs, parameters = build_model(X_train, Y_train, layers_dims, num_epochs=num_epochs, gd="adam",
keep_prob=keep_prob,
lambd=lambd, mini_batch_size=mini_batch_size, print_mini_cost=True, begin=begin,
learning_rate=learning_rate)
    show_accuracy(num_epochs, X_train, Y_train, Y_train_orig, X_verify, Y_verify, Y_verify_orig)
if __name__ == "__main__":
begin = 0
num_epochs = 2000
drop_out = 0
lambd = 1
learning_rate = 0.0001
mini_batch_size = 32
layers_dims = [IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 120, 72, len(LABELS)]
train_ratio = 0.8
sample_path = "/tmp/samples"
start(begin, num_epochs, drop_out, lambd, learning_rate, mini_batch_size, layers_dims, train_ratio,
          reload_data=True, sample_path=sample_path)
```