Softmax multi-class classification and multi-task learning examples

Multi-class classification: recognizing one character of a 6-character captcha:
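Before the full script, here is a minimal sketch (with made-up toy values, not part of the script below) of the softmax cross-entropy it relies on: softmax turns logits into a distribution, the loss is the negative log-probability of the true class, and the gradient with respect to the logits is simply a - y.

python

import numpy as np

z = np.array([[2.0], [1.0], [0.1]])  # logits for one sample (3 classes, as a column vector)
y = np.array([[1.0], [0.0], [0.0]])  # one-hot label: the true class is class 0
a = np.exp(z - z.max()) / np.sum(np.exp(z - z.max()))  # softmax, shifted by the max for numerical stability
loss = -np.sum(y * np.log(a))  # cross-entropy loss
dz = a - y  # gradient of the loss w.r.t. the logits
print(loss, dz.ravel())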

python
import logging
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

IMG_HEIGHT = 50
IMG_WIDTH = 200
IMG_CHANNEL = 1
LABELS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
          'v', 'w', 'x', 'y', 'z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
TRAIN_RECORD_STEP = 50

SUPER_PARAM_LEARNING_RATE = [0.1, 0.01, 0.001, 0.0001]  # log scale, random sampling: r = -4 * np.random.rand() in [-4, 0], lr = 10^r
SUPER_PARAM_MINI_BATCH_SIZE = [16, 32, 64]  #

SUPER_PARAM_LAYER1_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER2_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER3_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER_SIZE = [1, 2, 3]  # linear scale, start from 1

SUPER_PARAM_DROP_OUT = [0.3, 0.5, 0.7]  # linear scale, random sampling
SUPER_PARAM_LAMBD = [0.3, 0.5, 0.7]  # linear scale, random sampling


# Tuning tips:
# number of layers: start from 1
# batch size: start around 128
# dropout: 0.5
# L2 regularization: 1.0
# watch the positive/negative sample ratio
# learning_rate too large: the loss explodes or goes NaN; too small: the loss barely moves

def get_logger():
    if not hasattr(get_logger, "logger"):
        no_format_logger = logging.getLogger("logger")
        no_format_logger.setLevel(logging.DEBUG)
        not_format_file_handler = logging.FileHandler("./costs.log")
        not_format_file_handler.setFormatter(logging.Formatter('%(message)s'))
        no_format_logger.addHandler(not_format_file_handler)
        get_logger.logger = no_format_logger
    return get_logger.logger


def get_image_data(img_path):
    image = Image.open(img_path)
    gray_img = image.convert('L')
    data = np.array(gray_img)
    return data


def get_image_label(img_path):
    img_name = img_path.split("/")[-1]
    img_code = str(img_name)[0]
    return img_code


def show_one_img(x):
    plt.imshow(x.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')  # grayscale image, so no channel axis


def mv_file(file_paths, target_path):
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            file_bytes = f.read()
            file_path = f.name
            file_name = file_path.split("/")[-1]
        new_save_file_path = target_path + "/" + file_name
        with open(new_save_file_path, 'wb') as f:
            f.write(file_bytes)


def split_samples(sample_dir_path):
    print("start split samples...")
    sample_train_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_train"
    sample_verify_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_verify"
    sample_test_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_test"
    if not os.path.exists(sample_train_path):
        os.makedirs(sample_train_path)
    if not os.path.exists(sample_verify_path):
        os.makedirs(sample_verify_path)
    if not os.path.exists(sample_test_path):
        os.makedirs(sample_test_path)
    img_paths = []
    for root, dirs, files in os.walk(sample_dir_path):
        for file in files:
            if str(file).endswith(".png"):
                img_paths.append(os.path.join(root, file))
    permutation = np.random.permutation(len(img_paths))
    shuffled_img_paths = [img_paths[i] for i in permutation]
    img_count = len(img_paths)
    train_count = int(img_count * 0.8)
    verify_count = int(img_count * 0.19)
    train_img_paths = shuffled_img_paths[0:train_count]
    verify_img_paths = shuffled_img_paths[train_count:train_count + verify_count]
    test_img_paths = shuffled_img_paths[train_count + verify_count:]
    print(
        f"total:{img_count}, split to train:{len(train_img_paths)}, verify:{len(verify_img_paths)}, test:{len(test_img_paths)}")
    mv_file(train_img_paths, sample_train_path)
    mv_file(verify_img_paths, sample_verify_path)
    mv_file(test_img_paths, sample_test_path)


def char_to_label(char):
    return LABELS.index(char)


def label_to_char(index):
    return LABELS[index]


def save_h5_data(key, data, mode='w'):
    with h5py.File("dataset.h5", mode) as f:
        f.create_dataset(key, data=data)


def read_h5_data(key):
    with h5py.File("dataset.h5", 'r') as f:
        return np.array(f[key])


def file_to_data(data_set_path, name_X, name_Y):
    img_paths = []
    image_count = 0
    for root, dirs, files in os.walk(data_set_path):
        for file in files:
            if file.endswith(".png"):
                image_count += 1
                img_paths.append(os.path.join(root, file))
    print("找图片:{}张".format(image_count))
    X = np.zeros((IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, image_count)).astype(np.int64)
    Y = np.zeros((1, image_count)).astype(np.int64)
    for simple_index in range(image_count):
        # load data
        img_path = img_paths[simple_index]
        simple_data = get_image_data(img_path)
        X[:, simple_index] = simple_data.reshape(IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, -1).ravel()
        # load label
        simple_label = get_image_label(img_path)
        Y[:, simple_index] = char_to_label(simple_label)
    save_h5_data(name_X, X, mode='a')
    save_h5_data(name_Y, Y, mode='a')


def load_data_set(refresh=False):
    if refresh:
        if os.path.exists('dataset.h5'):
            os.remove('dataset.h5')
        file_to_data('./sample_train', 'train_set_x', 'train_set_y')
        file_to_data('./sample_verify', 'verify_set_x', 'verify_set_y')
        file_to_data('./sample_test', 'test_set_x', 'test_set_y')
    # check_data_set('train_set_x', 'train_set_y')
    # check_data_set('verify_set_x', 'verify_set_y')
    # check_data_set('test_set_x', 'test_set_y')
    train_set_x = read_h5_data('train_set_x')
    train_set_y = read_h5_data('train_set_y')
    verify_set_x = read_h5_data('verify_set_x')
    verify_set_y = read_h5_data('verify_set_y')
    return train_set_x, train_set_y, verify_set_x, verify_set_y


def check_data_set(name_X, name_Y):
    X = read_h5_data(name_X)
    Y = read_h5_data(name_Y)
    img_index = np.random.randint(0, Y.shape[1])
    print(f"im {img_index} is:", label_to_char(Y[:, img_index][0]))
    show_one_img(X[:, img_index])
    plt.show()


def convert_to_one_hot(Y, C):
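    # e.g. Y = [[2, 0]], C = 3 -> columns are one-hot vectors: [[0, 1], [0, 0], [1, 0]]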
    Y = np.eye(C)[Y.reshape(-1)].T
    return Y


def initialize_parameters(begin, is_adam_gd, layers_dims):  # (12288,20,12,5)
    if begin != 0:
        h5_name = f"epoch_{begin}.h5"
        parameters = read_parameters(h5_name)
        return parameters
    L = len(layers_dims)
    parameters = {}
    for l in range(1, L):
        if is_adam_gd:
            parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.1
        else:
            parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        parameters[f"b{l}"] = np.zeros((layers_dims[l], 1))
        assert (parameters[f"W{l}"].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters[f"b{l}"].shape == (layers_dims[l], 1))
    return parameters


def initialize_velocity(parameters):
    L = len(parameters) // 2
    v = {}
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    return v


def initialize_adam(parameters):
    L = len(parameters) // 2
    v = {}
    s = {}
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
        s[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        s[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    return v, s


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    m = X.shape[1]
    mini_batches = []
    np.random.seed(seed)
    # shuffle the columns
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape(Y.shape[0], m)
    num_mini_batches = int(m / mini_batch_size)
    for k in range(num_mini_batches):
        # slice out one mini-batch
        mini_batch_X = shuffled_X[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_mini_batches * mini_batch_size:m]
        mini_batch_Y = shuffled_Y[:, num_mini_batches * mini_batch_size:m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    return mini_batches


def relu(Z):
    return np.maximum(Z, 0)


def softmax(Z):
    Z = Z - np.max(Z, axis=0, keepdims=True)  # shift by the column max for numerical stability
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
    return A


def bn_z(Z):
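    # Simplified batch normalization: standardize each unit over the mini-batch, then apply a
    # fixed scale r and shift b. gamma/beta are not learned and no running statistics are kept,
    # so this is unreliable for inference on very small batches (e.g. a single sample).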
    m = Z.shape[1]
    U = np.sum(Z, axis=1, keepdims=True) / m
    Z = Z - U
    E = np.sum(np.square(Z), axis=1, keepdims=True) / m
    Z = Z / np.sqrt(E + 1e-8)
    r = 0.9
    b = 0.01
    Z = Z * r + b
    return Z


def linear_activation_forward(A_prew, W, b, activation):
    Z = np.dot(W, A_prew) + b
    Z = bn_z(Z)
    assert Z.shape == (W.shape[0], A_prew.shape[1])
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)
    else:
        assert (1 != 1), f"there is no support activation:{activation}"
    assert (A.shape == (W.shape[0], A_prew.shape[1]))
    cache = (A_prew, W, b, Z)
    return A, cache


def forward_propagation(X, Y, parameters):  # (12288,20,12,5)
    caches = []
    L = len(parameters) // 2
    A = X
    for l in range(1, L):
        A_prew = A
        A, cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
        caches.append(cache)
    AL, cacheL = linear_activation_forward(A, parameters[f"W{L}"], parameters[f"b{L}"], 'softmax')
    assert (AL.shape == Y.shape)
    caches.append(cacheL)
    return AL, caches


def forward_propagation_with_dropout(X, Y, parameters, keep_prob=0.8):  # (12288,20,12,5)
    caches = []
    L = len(parameters) // 2
    A = X
    for l in range(1, L):
        A_prew = A
        A, linear_cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
        D = np.random.rand(A.shape[0], A.shape[1])
        D = D < keep_prob
        A = A * D
        A = A / keep_prob
        cache = (linear_cache, D)
        caches.append(cache)
    AL, linear_cacheL = linear_activation_forward(A, parameters[f"W{L}"], parameters[f"b{L}"], 'softmax')
    assert (AL.shape == Y.shape)
    # last A not do dropout
    cacheL = (linear_cacheL, None)
    caches.append(cacheL)
    return AL, caches


def compute_cost(AL, Y_train):
    m = Y_train.shape[1]
    logprobs = -np.sum(np.multiply(Y_train, np.log(AL)), axis=0, keepdims=True)
    cost = np.sum(logprobs) / m
    return cost


def compute_cost_with_regularization(AL, Y, parameters, lambd):
    m = Y.shape[1]
    logprobs = -np.sum(np.multiply(Y, np.log(AL)), axis=0, keepdims=True)
    cost = np.sum(logprobs) / m
    sum_W = 0.
    for key, value in parameters.items():
        if str(key).startswith("W"):
            W = parameters[key]
            sum_W += np.sum(np.square(W))
    L2_regularization_cost = lambd * sum_W / (2 * m)
    return cost + L2_regularization_cost


def backward_propagation(AL, Y, caches):
    m = Y.shape[1]
    grads = {}
    L = len(caches)  # w1,w2,w3
    Y = Y.reshape(AL.shape)
    current_cache = caches[-1]
    (AL_prew, WL, bL, ZL) = current_cache

    # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
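    # Derivation sketch: with L = -sum_k y_k * log(a_k) and a = softmax(z),
    # dL/dz_j = a_j * sum_k y_k - y_j = a_j - y_j, because the one-hot labels sum to 1.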
    dZL = AL - Y
    grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T)
    grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
    grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
    assert (dZL.shape == ZL.shape)
    assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
    assert (grads[f"dW{L}"].shape == WL.shape)
    assert (grads[f"db{L}"].shape == bL.shape)
    # w2, w1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward(grads[f"dA{c}"], cache, 'relu')
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def linear_activation_backward(dA, current_cache, activation):
    m = dA.shape[1]
    (A_prew, W, b, Z) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def linear_activation_backward_with_dropout(dA, current_cache, activation):
    m = dA.shape[1]
    ((A_prew, W, b, Z), D) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def backward_propagation_with_dropout(AL, Y, caches, keep_prob):
    m = Y.shape[1]
    grads = {}
    L = len(caches)  # w1,w2,w3
    Y = Y.reshape(AL.shape)
    current_cache = caches[-1]
    ((AL_prew, WL, bL, ZL,), DL) = current_cache

    # For multi-class cross-entropy on top of softmax, the combined derivative is dL/dz = a - y ; Z = W.A[l-1] + b
    dZL = AL - Y
    grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T)  # dW3
    grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # db3
    grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)  # dA2
    # reversed dropout, last dA not do dropout
    ((_, _, _, _,), D_prew) = caches[-2]
    grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] * D_prew  # dA2
    grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] / keep_prob  # dA2

    assert (dZL.shape == ZL.shape)
    assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
    assert (grads[f"dW{L}"].shape == WL.shape)
    assert (grads[f"db{L}"].shape == bL.shape)
    # L-2, L-1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]  # cache[1]
        dA_prew, dW, db = linear_activation_backward_with_dropout(grads[f"dA{c}"], cache, 'relu')
        if c > 1:
            cache_prew = caches[c - 2]
            ((_, _, _, _,), D_prew) = cache_prew
            # reversed dropout
            dA_prew = dA_prew * D_prew
            dA_prew = dA_prew / keep_prob
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def backward_propagation_with_regularization(AL, Y, caches, lambd):
    m = Y.shape[1]
    grads = {}
    L = len(caches)  # w1,w2,w3
    Y = Y.reshape(AL.shape)
    current_cache = caches[-1]
    (AL_prew, WL, bL, ZL) = current_cache

    # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
    dZL = AL - Y
    grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
    grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
    grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
    assert (dZL.shape == ZL.shape)
    assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
    assert (grads[f"dW{L}"].shape == WL.shape)
    assert (grads[f"db{L}"].shape == bL.shape)
    # w2, w1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward_with_regularization(grads[f"dA{c}"], cache, 'relu', lambd)
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def linear_activation_backward_with_regularization(dA, current_cache, activation, lambd):
    m = dA.shape[1]
    (A_prew, W, b, Z) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def linear_activation_backward_with_regularization_dropout(dA, current_cache, activation, lambd):
    m = dA.shape[1]
    ((A_prew, W, b, Z), D) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def backward_propagation_with_regularization_dropout(AL, Y, caches, lambd, keep_prob):
    m = Y.shape[1]
    grads = {}
    L = len(caches)  # w1,w2,w3
    Y = Y.reshape(AL.shape)
    current_cache = caches[-1]
    ((AL_prew, WL, bL, ZL), DL) = current_cache

    # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
    dZL = AL - Y
    grads[f"dW{L}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
    grads[f"db{L}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
    grads[f"dA{L - 1}"] = np.dot(WL.T, dZL)
    ((_, _, _, _,), D_prew) = caches[-2]
    grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] * D_prew  # dA2
    grads[f"dA{L - 1}"] = grads[f"dA{L - 1}"] / keep_prob  # dA2

    assert (dZL.shape == ZL.shape)
    assert (grads[f"dA{L - 1}"].shape == AL_prew.shape)
    assert (grads[f"dW{L}"].shape == WL.shape)
    assert (grads[f"db{L}"].shape == bL.shape)
    # w2, w1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward_with_regularization_dropout(grads[f"dA{c}"], cache, 'relu', lambd)
        if c > 1:
            cache_prew = caches[c - 2]
            ((_, _, _, _,), D_prew) = cache_prew
            # reversed dropout
            dA_prew = dA_prew * D_prew
            dA_prew = dA_prew / keep_prob
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def update_parameters(grads, parameters, learning_rate):
    L = len(parameters) // 2
    for l in range(1, L + 1):
        w_substract = learning_rate * grads[f"dW{l}"]
        parameters[f"W{l}"] = parameters[f"W{l}"] - w_substract
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]
    return parameters


def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2

    for l in range(L):
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]

        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]

    return parameters, v


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
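    # Adam: v holds the first-moment (mean) estimates of the gradients, s the second-moment
    # (uncentered variance) estimates; both are bias-corrected by (1 - beta^t), and the step is
    # lr * v_hat / sqrt(s_hat + eps) (epsilon added inside the square root, a common minor variant).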
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    for l in range(1, L + 1):
        v[f"dW{l}"] = beta1 * v[f"dW{l}"] + (1 - beta1) * grads[f"dW{l}"]
        v[f"db{l}"] = beta1 * v[f"db{l}"] + (1 - beta1) * grads[f"db{l}"]
        v_corrected[f"dW{l}"] = v[f"dW{l}"] / (1 - np.power(beta1, t))
        v_corrected[f"db{l}"] = v[f"db{l}"] / (1 - np.power(beta1, t))

        s[f"dW{l}"] = beta2 * s[f"dW{l}"] + (1 - beta2) * np.power(grads[f"dW{l}"], 2)
        s[f"db{l}"] = beta2 * s[f"db{l}"] + (1 - beta2) * np.power(grads[f"db{l}"], 2)
        s_corrected[f"dW{l}"] = s[f"dW{l}"] / (1 - np.power(beta2, t))
        s_corrected[f"db{l}"] = s[f"db{l}"] / (1 - np.power(beta2, t))

        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v_corrected[f"dW{l}"] / np.sqrt(
            s_corrected[f"dW{l}"] + epsilon)
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v_corrected[f"db{l}"] / np.sqrt(
            s_corrected[f"db{l}"] + epsilon)
    return parameters, v, s


def build_model(X_train, Y_train, layers_dims, learning_rate=0.0001, num_epochs=1000, mini_batch_size=32, beta1=0.9,
                beta2=0.999, epsilon=1e-8, gd="sgd", lambd=0.7, keep_prob=0.8, print_cost=True, print_mini_cost=False,
                begin=0):
    t = 0
    m = X_train.shape[1]
    if gd == "sgd":
        is_adam_gd = False
        print("use gd: sgd")
    elif gd == "momentum":
        is_adam_gd = False
        print("use gd: momentum")
    elif gd == "adam":
        is_adam_gd = True
        print("use gd: adam")
    else:
        assert (1 != 1), "Not support gd: %s" % gd
    parameters = initialize_parameters(begin, is_adam_gd, layers_dims)
    if gd == "momentum":
        v = initialize_velocity(parameters)
    if gd == "adam":
        v, s = initialize_adam(parameters)
    costs = []
    seed = 3
    num_mini_batches = int(m / mini_batch_size)
    for epoch in range(begin, num_epochs):
        seed = seed + 1
        mini_batches = random_mini_batches(X_train, Y_train, mini_batch_size, seed)
        epoch_cost = 0.
        mini_batch_index = 0
        for mini_batch in mini_batches:
            (mini_batch_X, mini_batch_Y) = mini_batch
            if keep_prob == 1:
                AL, caches = forward_propagation(mini_batch_X, mini_batch_Y, parameters)
            else:
                AL, caches = forward_propagation_with_dropout(mini_batch_X, mini_batch_Y, parameters, keep_prob)
            if lambd == 0:
                mini_batch_cost = compute_cost(AL, mini_batch_Y)
                if keep_prob == 1:
                    grads = backward_propagation(AL, mini_batch_Y, caches)
                else:
                    grads = backward_propagation_with_dropout(AL, mini_batch_Y, caches, keep_prob)
            else:
                mini_batch_cost = compute_cost_with_regularization(AL, mini_batch_Y, parameters, lambd)
                if keep_prob == 1:
                    grads = backward_propagation_with_regularization(AL, mini_batch_Y, caches, lambd)
                else:
                    grads = backward_propagation_with_regularization_dropout(AL, mini_batch_Y, caches, lambd, keep_prob)
            if gd == "sgd":
                parameters = update_parameters(grads, parameters, learning_rate)
            elif gd == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta=0.9, learning_rate=learning_rate)
            else:
                t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate,
                                                               beta1, beta2, epsilon)
            epoch_cost += mini_batch_cost / num_mini_batches
            if print_mini_cost and mini_batch_index % 2 == 0:
                print("[{}/{}]Mini batch {} cost: {}".format(epoch, num_epochs, mini_batch_index, mini_batch_cost))
            mini_batch_index += 1
        if print_cost == True and epoch % TRAIN_RECORD_STEP == 0 and epoch > 0:
            get_logger().info(f"Cost after epoch {epoch}: {epoch_cost}")
            # accuracy(epoch, parameters)
            # save_parameters(h5_name=f"epoch_{epoch}.h5", parameters=parameters)
        if print_cost == True and epoch > 0:
            costs.append(epoch_cost)
    return costs


def save_parameters(h5_name, parameters):
    with h5py.File(h5_name, 'w') as f:
        for key, value in parameters.items():
            f.create_dataset(key, data=value)


def read_parameters(h5_name):
    parameters = {}
    with h5py.File(h5_name, 'r') as f:
        for key in f.keys():
            parameters[key] = np.array(f[key])
    return parameters


def predict(X, Y, parameters):
    AL, _ = forward_propagation(X, Y, parameters)
    prediction = np.argmax(AL, axis=0).reshape(1, -1)
    return prediction


def predict_one(parameters_name):
    parameters = read_parameters(parameters_name)
    np.random.seed(None)
    permutation = np.random.permutation(30)
    X_random = X_verify[:, permutation]
    Y_random = Y_verify[:, permutation]
    for index in range(0, 9):
        plt.subplot(191 + index)
        X_ = np.float32(X_random[:, index].reshape(-1, 1))
        Y_ = np.float32(Y_random[:, index].reshape(-1, 1))
        plt.imshow(X_.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
        A3, _ = forward_propagation(X_, Y_, parameters)
        print(f"Prediction for sample {index}:", label_to_char(np.argmax(A3)))
    plt.show()


def show_cost(costs, learning_rate, lamd, keep_prob, layers_dims, num_epochs, mini_batch_size):
    plt.plot(costs)
    title = f"learning_reate={learning_rate}"
    # title = f"layers_dims={layers_dims}"
    # , lamd={lamd}, keep_prob={keep_prob}, layers_dims:{layers_dims}, num_epochs={num_epochs},mini_batch_size={mini_batch_size}"
    plt.title(title)
    plt.ylabel("cost")
    plt.xlabel("iterations (per tens)")


def accuracy(epoch_num, parameters):
    train_prediction = predict(X_train, Y_train, parameters)
    train_accuracy = np.sum(train_prediction == Y_train_orig) / Y_train_orig.size
    # validation-set accuracy
    verify_prediction = predict(X_verify, Y_verify, parameters)
    verify_accuracy = np.sum(verify_prediction == Y_verify_orig) / Y_verify_orig.size
    print(
        f"After {epoch_num} epochs, accuracy on training data: {round(train_accuracy * 100, 2)}%, on validation data: {round(verify_accuracy * 100, 2)}%")
    get_logger().info(
        f"After {epoch_num} epochs, accuracy on training data: {round(train_accuracy * 100, 2)}%, on validation data: {round(verify_accuracy * 100, 2)}%")


def show_accuracy(num_epochs):
    for i in range(0, num_epochs, TRAIN_RECORD_STEP):
        if i == num_epochs:
            return
        parameters_name = f"epoch_{i + TRAIN_RECORD_STEP}.h5"
        parameters = read_parameters(parameters_name)
        accuracy(i, parameters)


if __name__ == "__main__":
    # split_samples("/tmp/samples")
    if os.path.exists("./costs.log"):
        os.remove("./costs.log")
    X_train_orig, Y_train_orig, X_verify_orig, Y_verify_orig = load_data_set(refresh=False)
    X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1)
    X_verify_flatten = X_verify_orig.reshape(X_verify_orig.shape[0], -1)
    # simple normalization to [0, 1]

    X_train = X_train_flatten / 255
    X_verify = X_verify_flatten / 255
    Y_train = convert_to_one_hot(Y_train_orig, len(LABELS))
    Y_verify = convert_to_one_hot(Y_verify_orig, len(LABELS))

    print(X_train.shape)
    print(X_verify.shape)
    print(Y_train.shape)
    print(Y_verify.shape)
    begin = 0
    num_epochs = 200
    # mini_batch_size = 253
    keep_prob = 1
    lambd = 0
    learning_rate = 0.0008
    sub_train = 5
    mini_batch_size = 600
    layers_dims = [IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 72, len(LABELS)]
    index = 0
    lrs = []
    for i in range(4):
        lrs.append(np.power(10, -4 * np.random.rand()))
    layers_dims_list = [
        [IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 1, len(LABELS)],
    ]
    print("gen lr:", lrs)
    for index, lr in enumerate(lrs):
        costs = build_model(X_train, Y_train, layers_dims, num_epochs=num_epochs, gd="adam", keep_prob=keep_prob,
                            lambd=lambd, mini_batch_size=mini_batch_size, print_mini_cost=True, begin=begin,
                            learning_rate=lr)
        plt.subplot(100 + 10 * sub_train + index + 1)
        show_cost(costs, lr, lambd, keep_prob, layers_dims, num_epochs, mini_batch_size)
    plt.show()

Multi-task learning (hard parameter sharing of the hidden layers) to recognize all 6 characters:
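Hard parameter sharing means the six character positions share the same hidden layers and only the output softmax head is task-specific; the per-position losses are averaged into a single training objective. A minimal sketch of that structure (hypothetical shapes, independent of the script below):

python

import numpy as np

n_x, n_h, n_classes, n_tasks, m = 12, 8, 36, 6, 4
rng = np.random.default_rng(0)
W1 = rng.standard_normal((n_h, n_x)) * 0.1  # shared hidden layer
heads = [rng.standard_normal((n_classes, n_h)) * 0.1 for _ in range(n_tasks)]  # one softmax head per position

X = rng.standard_normal((n_x, m))
H = np.maximum(W1 @ X, 0)  # shared representation (ReLU)
for k, Wk in enumerate(heads):
    Zk = Wk @ H
    Ak = np.exp(Zk - Zk.max(axis=0, keepdims=True))
    Ak = Ak / Ak.sum(axis=0, keepdims=True)  # per-position softmax over the classes
    print(f"head {k}: output shape {Ak.shape}")  # (n_classes, m)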

python
import logging
import os
import shutil

import h5py
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

IMG_HEIGHT = 50
IMG_WIDTH = 200
IMG_CHANNEL = 1
IMG_LABEL_NUM = 6
LABELS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
          'v', 'w', 'x', 'y', 'z',
          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
TRAIN_RECORD_STEP = 50

SUPER_PARAM_LEARNING_RATE = [0.1, 0.01, 0.001, 0.0001]  # log scale, random sampling: r = -4 * np.random.rand() in [-4, 0], lr = 10^r
SUPER_PARAM_MINI_BATCH_SIZE = [16, 32, 64]  #

SUPER_PARAM_LAYER1_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER2_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER3_NUM = [1, 2]  # linear scale, random sampling, start from 16/32/128
SUPER_PARAM_LAYER_SIZE = [1, 2, 3]  # linear scale, start from 1

SUPER_PARAM_DROP_OUT = [0.3, 0.5, 0.7]  # linear scale, random sampling
SUPER_PARAM_LAMBD = [0.3, 0.5, 0.7]  # linear scale, random sampling


# Tuning tips:
# number of layers: start from 1
# batch size: start around 128
# dropout: 0.5
# L2 regularization: 1.0
# watch the positive/negative sample ratio
# learning_rate too large: the loss explodes or goes NaN; too small: the loss barely moves

def get_logger():
    if not hasattr(get_logger, "logger"):
        no_format_logger = logging.getLogger("logger")
        no_format_logger.setLevel(logging.DEBUG)
        not_format_file_handler = logging.FileHandler("./costs.log")
        not_format_file_handler.setFormatter(logging.Formatter('%(message)s'))
        no_format_logger.addHandler(not_format_file_handler)
        get_logger.logger = no_format_logger
    return get_logger.logger


def get_image_data(img_path):
    image = Image.open(img_path)
    gray_img = image.convert('L')
    data = np.array(gray_img)
    return data


def get_image_label(img_path):
    img_name = img_path.split("/")[-1]
    img_codes = {}
    for label_index in range(IMG_LABEL_NUM):
        img_codes[f"image_code_{label_index}"] = str(img_name)[label_index]
    return img_codes


def show_one_img(x):
    plt.imshow(x.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')  # grayscale image, so no channel axis


def mv_file(file_paths, target_path):
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            file_bytes = f.read()
            file_path = f.name
            file_name = file_path.split("/")[-1]
        new_save_file_path = target_path + "/" + file_name
        with open(new_save_file_path, 'wb') as f:
            f.write(file_bytes)


def split_samples(sample_dir_path, train_ratio):
    print("start split samples...")
    sample_train_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_train"
    sample_verify_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_verify"
    sample_test_path = os.path.dirname(os.path.abspath(__file__)) + "/sample_test"
    if not os.path.exists(sample_train_path):
        os.makedirs(sample_train_path)
    if not os.path.exists(sample_verify_path):
        os.makedirs(sample_verify_path)
    if not os.path.exists(sample_test_path):
        os.makedirs(sample_test_path)
    img_paths = []
    for root, dirs, files in os.walk(sample_dir_path):
        for file in files:
            if str(file).endswith(".png"):
                img_paths.append(os.path.join(root, file))
    permutation = np.random.permutation(len(img_paths))
    shuffled_img_paths = [img_paths[i] for i in permutation]
    img_count = len(img_paths)
    train_count = int(img_count * train_ratio)
    verify_count = int(img_count * (1 - train_ratio))
    train_img_paths = shuffled_img_paths[0:train_count]
    verify_img_paths = shuffled_img_paths[train_count:train_count + verify_count]
    test_img_paths = shuffled_img_paths[train_count + verify_count:]
    print(
        f"total:{img_count}, split to train:{len(train_img_paths)}, verify:{len(verify_img_paths)}, test:{len(test_img_paths)}")
    mv_file(train_img_paths, sample_train_path)
    mv_file(verify_img_paths, sample_verify_path)
    mv_file(test_img_paths, sample_test_path)


def char_to_label(char):
    return LABELS.index(char)


def label_to_char(index):
    return LABELS[index]


def save_h5_data(key, data, mode='w'):
    with h5py.File("dataset.h5", mode) as f:
        f.create_dataset(key, data=data)


def read_h5_data(key):
    with h5py.File("dataset.h5", 'r') as f:
        return np.array(f[key])


def file_to_data(data_set_path, name_X, name_Y):
    img_paths = []
    image_count = 0
    for root, dirs, files in os.walk(data_set_path):
        for file in files:
            if file.endswith(".png"):
                image_count += 1
                img_paths.append(os.path.join(root, file))
    print("找图片:{}张".format(image_count))
    X = np.zeros((IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, image_count)).astype(np.int64)
    Y = np.zeros((IMG_LABEL_NUM, image_count)).astype(np.int64)
    for simple_index in range(image_count):
        # load data
        img_path = img_paths[simple_index]
        simple_data = get_image_data(img_path)
        X[:, simple_index] = simple_data.reshape(IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, -1).ravel()
        # load label
        img_codes = get_image_label(img_path)
        for label_index in range(IMG_LABEL_NUM):
            Y[label_index, simple_index] = char_to_label(img_codes[f"image_code_{label_index}"])
    save_h5_data(name_X, X, mode='a')
    save_h5_data(name_Y, Y, mode='a')


def load_data_set(refresh=False, train_ratio=0.8, sample_path=''):
    if refresh:
        if os.path.exists('./sample_train'):
            shutil.rmtree("./sample_train")
        if os.path.exists('./sample_verify'):
            shutil.rmtree("./sample_verify")
        if os.path.exists('./sample_test'):
            shutil.rmtree("./sample_test")
        split_samples(sample_path, train_ratio)

        if os.path.exists('dataset.h5'):
            os.remove('dataset.h5')
        file_to_data('./sample_train', 'train_set_x', 'train_set_y')
        file_to_data('./sample_verify', 'verify_set_x', 'verify_set_y')
        file_to_data('./sample_test', 'test_set_x', 'test_set_y')
    # check_data_set('train_set_x', 'train_set_y')
    # check_data_set('verify_set_x', 'verify_set_y')
    # check_data_set('test_set_x', 'test_set_y')
    train_set_x = read_h5_data('train_set_x')
    train_set_y = read_h5_data('train_set_y')
    verify_set_x = read_h5_data('verify_set_x')
    verify_set_y = read_h5_data('verify_set_y')
    return train_set_x, train_set_y, verify_set_x, verify_set_y


def check_data_set(name_X, name_Y):
    X = read_h5_data(name_X)
    Y = read_h5_data(name_Y)
    img_index = np.random.randint(0, Y.shape[1])
    text = []
    for label_index in range(IMG_LABEL_NUM):
        text.append(label_to_char(Y[:, img_index][label_index]))
    print(f"im {img_index} is:", text)
    show_one_img(X[:, img_index])
    plt.show()


def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)].T
    return Y


def initialize_parameters(begin, layers_dims):  # e.g. (10000, 120, 72, 36)
    if begin != 0:
        h5_name = f"epoch_{begin}.h5"
        parameters = read_parameters(h5_name)
        return parameters
    L = len(layers_dims)
    parameters = {}
    # shared hidden layers
    for l in range(1, L - 1):
        parameters[f"W{l}"] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.1
        parameters[f"b{l}"] = np.zeros((layers_dims[l], 1))
        assert (parameters[f"W{l}"].shape == (layers_dims[l], layers_dims[l - 1]))
        assert (parameters[f"b{l}"].shape == (layers_dims[l], 1))
    # last layer: one softmax head per captcha position
    for label_index in range(IMG_LABEL_NUM):
        parameters[f"W{L - 1}_{label_index}"] = np.random.randn(layers_dims[L - 1], layers_dims[L - 2]) * 0.1
        parameters[f"b{L - 1}_{label_index}"] = np.zeros((layers_dims[L - 1], 1))
        assert (parameters[f"W{L - 1}_{label_index}"].shape == (layers_dims[L - 1], layers_dims[L - 2]))
        assert (parameters[f"b{L - 1}_{label_index}"].shape == (layers_dims[L - 1], 1))
    return parameters


def initialize_velocity(parameters):
    L = len(parameters) // 2
    v = {}
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    return v


def initialize_adam(parameters):
    L = len(parameters) // 2 - IMG_LABEL_NUM
    v = {}
    s = {}
    for l in range(1, L + 1):
        v[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        v[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
        s[f"dW{l}"] = np.zeros_like(parameters[f"W{l}"])
        s[f"db{l}"] = np.zeros_like(parameters[f"b{l}"])
    for label_index in range(IMG_LABEL_NUM):
        v[f"dW{L + 1}_{label_index}"] = np.zeros_like(parameters[f"W{L + 1}_{label_index}"])
        v[f"db{L + 1}_{label_index}"] = np.zeros_like(parameters[f"b{L + 1}_{label_index}"])
        s[f"dW{L + 1}_{label_index}"] = np.zeros_like(parameters[f"W{L + 1}_{label_index}"])
        s[f"db{L + 1}_{label_index}"] = np.zeros_like(parameters[f"b{L + 1}_{label_index}"])
    return v, s


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    m = X.shape[1]
    mini_batches = []
    np.random.seed(seed)
    # shuffle the columns
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, :, permutation]
    num_mini_batches = int(m / mini_batch_size)
    for k in range(num_mini_batches):
        # slice out one mini-batch
        mini_batch_X = shuffled_X[:, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
        mini_batch_Y = shuffled_Y[:, :, k * mini_batch_size:k * mini_batch_size + mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_mini_batches * mini_batch_size:m]
        mini_batch_Y = shuffled_Y[:, :, num_mini_batches * mini_batch_size:m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    return mini_batches


def relu(Z):
    return np.maximum(Z, 0)


def softmax(Z):
    Z = Z - np.max(Z, axis=0, keepdims=True)  # shift by the column max for numerical stability
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
    return A


def bn_z(Z):
    m = Z.shape[1]
    U = np.sum(Z, axis=1, keepdims=True) / m
    Z = Z - U
    E = np.sum(np.square(Z), axis=1, keepdims=True) / m
    Z = Z / np.sqrt(E + 1e-8)
    r = 0.9
    b = 0.01
    Z = Z * r + b
    return Z


def linear_activation_forward(A_prew, W, b, activation):
    Z = np.dot(W, A_prew) + b
    # Z = bn_z(Z)
    assert Z.shape == (W.shape[0], A_prew.shape[1])
    if activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)
    else:
        assert (1 != 1), f"there is no support activation:{activation}"
    assert (A.shape == (W.shape[0], A_prew.shape[1]))
    cache = (A_prew, W, b, Z)
    return A, cache


def forward_propagation(X, Y, parameters):  # (12288,20,12,5)
    caches = []
    L = len(parameters) // 2 - IMG_LABEL_NUM + 1
    A = X
    for l in range(1, L):
        A_prew = A
        A, cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
        caches.append(cache)
    ALS = {}
    CACHELS = {}
    for label_index in range(IMG_LABEL_NUM):
        AL, cacheL = linear_activation_forward(A, parameters[f"W{L}_{label_index}"], parameters[f"b{L}_{label_index}"],
                                               'softmax')
        assert (AL.shape == (Y.shape[1], Y.shape[2]))
        ALS[f"AL_{label_index}"] = AL
        CACHELS[f"CACHEL_{label_index}"] = cacheL
    return caches, ALS, CACHELS


def forward_propagation_with_dropout(X, Y, parameters, keep_prob=0.8):  # (12288,20,12,5)
    caches = []
    L = len(parameters) // 2 - IMG_LABEL_NUM + 1
    A = X
    for l in range(1, L):
        A_prew = A
        A, linear_cache = linear_activation_forward(A_prew, parameters[f"W{l}"], parameters[f"b{l}"], 'relu')
        D = np.random.rand(A.shape[0], A.shape[1])
        D = D < keep_prob
        A = A * D
        A = A / keep_prob
        cache = (linear_cache, D)
        caches.append(cache)

    ALS = {}
    CACHELS = {}
    for label_index in range(IMG_LABEL_NUM):
        AL, cacheL = linear_activation_forward(A, parameters[f"W{L}_{label_index}"], parameters[f"b{L}_{label_index}"],
                                               'softmax')
        assert (AL.shape == (Y.shape[1], Y.shape[2]))
        ALS[f"AL_{label_index}"] = AL
        CACHELS[f"CACHEL_{label_index}"] = cacheL
    return caches, ALS, CACHELS


def compute_cost(ALS, Y):
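    # multi-task loss: the mean of the six per-position softmax cross-entropies (equal task weighting)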
    m = Y.shape[2]
    cost = 0.
    for label_index in range(IMG_LABEL_NUM):
        logprobs = -np.sum(np.multiply(Y[label_index], np.log(ALS[f"AL_{label_index}"])), axis=0, keepdims=True)
        cost += np.sum(logprobs) / m
    return cost / IMG_LABEL_NUM


def compute_cost_with_regularization(ALS, Y, parameters, lambd):
    cost = compute_cost(ALS, Y)
    m = Y.shape[2]
    sum_WL = None
    for key, value in parameters.items():
        if str(key).startswith("W") and str(key).count("_") > 0:
            sum_WL = np.zeros_like(parameters[key])
            break
    for key, value in parameters.items():
        if str(key).startswith("W") and str(key).count("_") > 0:
            W = parameters[key]
            sum_WL += W
    sum_WL = sum_WL / IMG_LABEL_NUM
    sum_W = 0.
    for key, value in parameters.items():
        if str(key).startswith("W") and str(key).count("_") <= 0:
            W = parameters[key]
            sum_W += np.sum(np.square(W))
    sum_W += np.sum(np.square(sum_WL))
    L2_regularization_cost = lambd * sum_W / (2 * m)
    return cost + L2_regularization_cost


def backward_propagation(ALS, Y, caches, CACHELS):
    m = Y.shape[2]
    grads = {}
    L = len(caches) + 1  # w1,w2,w3
    dAL_prew = None
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        dAL_prew = np.zeros_like(AL_prew)
        break
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
        dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
        grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T)
        grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
        dAL_prew += np.dot(WL.T, dZL)
        assert (dAL_prew.shape == AL_prew.shape)
        assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
        assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
    # w2, w1
    # average dA over the task heads
    dAL_prew = dAL_prew / IMG_LABEL_NUM
    grads[f"dA{L - 1}"] = dAL_prew
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward(grads[f"dA{c}"], cache, 'relu')
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def linear_activation_backward(dA, current_cache, activation):
    m = dA.shape[1]
    (A_prew, W, b, Z) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def linear_activation_backward_with_dropout(dA, current_cache, activation):
    m = dA.shape[1]
    ((A_prew, W, b, Z), D) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def backward_propagation_with_dropout(ALS, Y, caches, CACHELS, keep_prob):
    m = Y.shape[2]
    grads = {}
    L = len(caches) + 1  # w1,w2,w3
    dAL_prew = None
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        dAL_prew = np.zeros_like(AL_prew)
        break
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
        dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
        grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T)
        grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
        dAL_prew += np.dot(WL.T, dZL)
        assert (dAL_prew.shape == AL_prew.shape)
        assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
        assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
    dAL_prew = dAL_prew / IMG_LABEL_NUM
    grads[f"dA{L - 1}"] = dAL_prew
    # L-2, L-1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]  # cache[1]
        dA_prew, dW, db = linear_activation_backward_with_dropout(grads[f"dA{c}"], cache, 'relu')
        if c > 1:
            cache_prew = caches[c - 2]
            ((_, _, _, _,), D_prew) = cache_prew
            # reversed dropout
            dA_prew = dA_prew * D_prew
            dA_prew = dA_prew / keep_prob
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def backward_propagation_with_regularization(ALS, Y, caches, CACHELS, lambd):
    m = Y.shape[2]
    grads = {}
    L = len(caches) + 1  # w1,w2,w3
    dAL_prew = None
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        dAL_prew = np.zeros_like(AL_prew)
        break
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
        dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
        grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
        grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
        dAL_prew += np.dot(WL.T, dZL)
        assert (dAL_prew.shape == AL_prew.shape)
        assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
        assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
    # w2, w1
    # average dA over the task heads
    dAL_prew = dAL_prew / IMG_LABEL_NUM
    grads[f"dA{L - 1}"] = dAL_prew
    # w2, w1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward_with_regularization(grads[f"dA{c}"], cache, 'relu', lambd)
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def linear_activation_backward_with_regularization(dA, current_cache, activation, lambd):
    m = dA.shape[1]
    (A_prew, W, b, Z) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def linear_activation_backward_with_regularization_dropout(dA, current_cache, activation, lambd):
    m = dA.shape[1]
    ((A_prew, W, b, Z), D) = current_cache
    if activation == 'relu':
        dZ = 1 * dA
        dZ[Z <= 0] = 0
    else:
        assert (1 != 1), f"not support backward activation:{activation}"
    # Z[L] = W[L].A[L-1] + b[L]
    dW = np.dot(dZ, A_prew.T) / m + (lambd * W) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prew = np.dot(W.T, dZ)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    assert (dA_prew.shape == A_prew.shape)
    return dA_prew, dW, db


def backward_propagation_with_regularization_dropout(ALS, Y, caches, CACHELS, lambd, keep_prob):
    m = Y.shape[2]
    grads = {}
    L = len(caches) + 1  # w1,w2,w3
    dAL_prew = None
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        dAL_prew = np.zeros_like(AL_prew)
        break
    for label_index in range(IMG_LABEL_NUM):
        cacheL = CACHELS[f"CACHEL_{label_index}"]
        (AL_prew, WL, bL, ZL) = cacheL
        # For multi-class cross-entropy on top of softmax, the combined derivative is: dL/dz = a - y
        dZL = (ALS[f"AL_{label_index}"] - Y[label_index])
        grads[f"dW{L}_{label_index}"] = 1 / m * np.dot(dZL, AL_prew.T) + (lambd * WL) / m
        grads[f"db{L}_{label_index}"] = 1 / m * np.sum(dZL, axis=1, keepdims=True)  # Z = W.A[l-1] + b
        dAL_prew += np.dot(WL.T, dZL)
        assert (dAL_prew.shape == AL_prew.shape)
        assert (grads[f"dW{L}_{label_index}"].shape == WL.shape)
        assert (grads[f"db{L}_{label_index}"].shape == bL.shape)
    # w2, w1
    # average dA over the task heads
    dAL_prew = dAL_prew / IMG_LABEL_NUM
    grads[f"dA{L - 1}"] = dAL_prew
    # w2, w1
    for c in reversed(range(1, L)):
        cache = caches[c - 1]
        dA_prew, dW, db = linear_activation_backward_with_regularization_dropout(grads[f"dA{c}"], cache, 'relu', lambd)
        if c > 1:
            cache_prew = caches[c - 2]
            ((_, _, _, _,), D_prew) = cache_prew
            # reversed dropout
            dA_prew = dA_prew * D_prew
            dA_prew = dA_prew / keep_prob
        grads[f"dA{c - 1}"], grads[f"dW{c}"], grads[f"db{c}"] = dA_prew, dW, db
    return grads


def update_parameters(grads, parameters, learning_rate):
    L = len(parameters) // 2
    for l in range(1, L + 1):
        w_substract = learning_rate * grads[f"dW{l}"]
        parameters[f"W{l}"] = parameters[f"W{l}"] - w_substract
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * grads[f"db{l}"]
    return parameters


def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2

    for l in range(L):
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]

        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]

    return parameters, v


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 2 - IMG_LABEL_NUM + 1
    v_corrected = {}
    s_corrected = {}
    for l in range(1, L):
        v[f"dW{l}"] = beta1 * v[f"dW{l}"] + (1 - beta1) * grads[f"dW{l}"]
        v[f"db{l}"] = beta1 * v[f"db{l}"] + (1 - beta1) * grads[f"db{l}"]
        v_corrected[f"dW{l}"] = v[f"dW{l}"] / (1 - np.power(beta1, t))
        v_corrected[f"db{l}"] = v[f"db{l}"] / (1 - np.power(beta1, t))

        s[f"dW{l}"] = beta2 * s[f"dW{l}"] + (1 - beta2) * np.power(grads[f"dW{l}"], 2)
        s[f"db{l}"] = beta2 * s[f"db{l}"] + (1 - beta2) * np.power(grads[f"db{l}"], 2)
        s_corrected[f"dW{l}"] = s[f"dW{l}"] / (1 - np.power(beta2, t))
        s_corrected[f"db{l}"] = s[f"db{l}"] / (1 - np.power(beta2, t))

        parameters[f"W{l}"] = parameters[f"W{l}"] - learning_rate * v_corrected[f"dW{l}"] / np.sqrt(
            s_corrected[f"dW{l}"] + epsilon)
        parameters[f"b{l}"] = parameters[f"b{l}"] - learning_rate * v_corrected[f"db{l}"] / np.sqrt(
            s_corrected[f"db{l}"] + epsilon)
    for label_index in range(IMG_LABEL_NUM):
        v[f"dW{L}_{label_index}"] = beta1 * v[f"dW{L}_{label_index}"] + (1 - beta1) * grads[f"dW{L}_{label_index}"]
        v[f"db{L}_{label_index}"] = beta1 * v[f"db{L}_{label_index}"] + (1 - beta1) * grads[f"db{L}_{label_index}"]
        v_corrected[f"dW{L}_{label_index}"] = v[f"dW{L}_{label_index}"] / (1 - np.power(beta1, t))
        v_corrected[f"db{L}_{label_index}"] = v[f"db{L}_{label_index}"] / (1 - np.power(beta1, t))

        s[f"dW{L}_{label_index}"] = beta2 * s[f"dW{L}_{label_index}"] + (1 - beta2) * np.power(
            grads[f"dW{L}_{label_index}"], 2)
        s[f"db{L}_{label_index}"] = beta2 * s[f"db{L}_{label_index}"] + (1 - beta2) * np.power(
            grads[f"db{L}_{label_index}"], 2)
        s_corrected[f"dW{L}_{label_index}"] = s[f"dW{L}_{label_index}"] / (1 - np.power(beta2, t))
        s_corrected[f"db{L}_{label_index}"] = s[f"db{L}_{label_index}"] / (1 - np.power(beta2, t))

        parameters[f"W{L}_{label_index}"] = parameters[f"W{L}_{label_index}"] - learning_rate * v_corrected[
            f"dW{L}_{label_index}"] / np.sqrt(
            s_corrected[f"dW{L}_{label_index}"] + epsilon)
        parameters[f"b{L}_{label_index}"] = parameters[f"b{L}_{label_index}"] - learning_rate * v_corrected[
            f"db{L}_{label_index}"] / np.sqrt(
            s_corrected[f"db{L}_{label_index}"] + epsilon)
    return parameters, v, s


def build_model(X_train, Y_train, layers_dims, learning_rate=0.0001, num_epochs=1000, mini_batch_size=32, beta1=0.9,
                beta2=0.999, epsilon=1e-8, gd="sgd", lambd=0.7, keep_prob=0.8, print_cost=True, print_mini_cost=False,
                begin=0):
    t = 0
    m = X_train.shape[1]
    costs = []
    seed = 88
    parameters = initialize_parameters(begin, layers_dims)
    v, s = initialize_adam(parameters)
    num_mini_batches = int(m / mini_batch_size)
    for epoch in range(begin, num_epochs):
        seed = seed + 1
        mini_batches = random_mini_batches(X_train, Y_train, mini_batch_size, seed)
        epoch_cost = 0.
        mini_batch_index = 0
        for mini_batch in mini_batches:
            (mini_batch_X, mini_batch_Y) = mini_batch
            if keep_prob == 1:
                caches, ALS, CACHELS = forward_propagation(mini_batch_X, mini_batch_Y, parameters)
            else:
                caches, ALS, CACHELS = forward_propagation_with_dropout(mini_batch_X, mini_batch_Y, parameters,
                                                                        keep_prob)
            if lambd == 0:
                mini_batch_cost = compute_cost(ALS, mini_batch_Y)
                if keep_prob == 1:
                    grads = backward_propagation(ALS, mini_batch_Y, caches, CACHELS)
                else:
                    grads = backward_propagation_with_dropout(ALS, mini_batch_Y, caches, CACHELS, keep_prob)
            else:
                mini_batch_cost = compute_cost_with_regularization(ALS, mini_batch_Y, parameters, lambd)
                if keep_prob == 1:
                    grads = backward_propagation_with_regularization(ALS, mini_batch_Y, caches, CACHELS, lambd)
                else:
                    grads = backward_propagation_with_regularization_dropout(ALS, mini_batch_Y, caches, CACHELS, lambd,
                                                                             keep_prob)
            t = t + 1
            parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate,
                                                           beta1, beta2, epsilon)
            epoch_cost += mini_batch_cost / num_mini_batches
            if print_mini_cost and mini_batch_index % 2 == 0:
                print("[{}/{}]Mini batch {} cost: {}".format(epoch, num_epochs, mini_batch_index, mini_batch_cost))
            mini_batch_index += 1
        if print_cost == True and epoch % TRAIN_RECORD_STEP == 0 and epoch > 0:
            get_logger().info(f"Cost after epoch {epoch}: {epoch_cost}")
            # accuracy(epoch, parameters)
            save_parameters(h5_name=f"epoch_{epoch}.h5", parameters=parameters)
        if print_cost == True and epoch > 0:
            costs.append(epoch_cost)
    return costs, parameters


def save_parameters(h5_name, parameters):
    with h5py.File(h5_name, 'w') as f:
        for key, value in parameters.items():
            f.create_dataset(key, data=value)


def read_parameters(h5_name):
    parameters = {}
    with h5py.File(h5_name, 'r') as f:
        for key in f.keys():
            parameters[key] = np.array(f[key])
    return parameters


def predict(X, Y, parameters):
    _, ALS, _ = forward_propagation(X, Y, parameters)
    predictions = np.zeros((IMG_LABEL_NUM, Y.shape[2]))
    for label_index in range(IMG_LABEL_NUM):
        AL = ALS[f"AL_{label_index}"]
        prediction = np.argmax(AL, axis=0).reshape(1, -1)
        predictions[label_index] = prediction
    return predictions


def predict_one(parameters_name):
    parameters = read_parameters(parameters_name)
    np.random.seed(None)
    permutation = np.random.permutation(30)
    X_random = X_verify[:, permutation]
    Y_random = Y_verify[:, :, permutation]
    for index in range(0, 9):
        plt.subplot(191 + index)
        X_ = np.float32(X_random[:, index].reshape(-1, 1))
        Y_ = np.float32(Y_random[:, :, index].reshape(IMG_LABEL_NUM, -1, 1))
        plt.imshow(X_.reshape(IMG_HEIGHT, IMG_WIDTH), cmap='gray')
        _, ALS, _ = forward_propagation(X_, Y_, parameters)
        text = "".join(label_to_char(np.argmax(ALS[f"AL_{i}"])) for i in range(IMG_LABEL_NUM))
        print(f"Prediction for sample {index}:", text)
    plt.show()


def show_cost(costs, learning_rate, lamd, keep_prob, layers_dims, num_epochs, mini_batch_size):
    plt.plot(costs)
    title = f"learning_reate={learning_rate}"
    # title = f"layers_dims={layers_dims}"
    # , lamd={lamd}, keep_prob={keep_prob}, layers_dims:{layers_dims}, num_epochs={num_epochs},mini_batch_size={mini_batch_size}"
    plt.title(title)
    plt.ylabel("cost")
    plt.xlabel("iterations (per tens)")


def accuracy(epoch_num, parameters):
    train_predictions = predict(X_train, Y_train, parameters)
    train_accuracy = np.sum(train_predictions == Y_train_orig) / Y_train_orig.size
    # validation-set accuracy
    verify_predictions = predict(X_verify, Y_verify, parameters)
    verify_accuracy = np.sum(verify_predictions == Y_verify_orig) / Y_verify_orig.size
    print(
        f"After {epoch_num} epochs, accuracy on training data: {round(train_accuracy * 100, 2)}%, on validation data: {round(verify_accuracy * 100, 2)}%")
    get_logger().info(
        f"After {epoch_num} epochs, accuracy on training data: {round(train_accuracy * 100, 2)}%, on validation data: {round(verify_accuracy * 100, 2)}%")


def show_accuracy(num_epochs):
    for i in range(0, num_epochs, TRAIN_RECORD_STEP):
        if i == num_epochs:
            return
        parameters_name = f"epoch_{i + TRAIN_RECORD_STEP}.h5"
        parameters = read_parameters(parameters_name)
        accuracy(i, parameters)


def start(begin, num_epochs, drop_out, lambd, learning_rate, mini_batch_size, layers_dims, train_ratio, reload_data,
          sample_path):
    # accuracy() and predict_one() read these as module-level globals, so declare them here
    global X_train, Y_train, X_verify, Y_verify, Y_train_orig, Y_verify_orig
    keep_prob = 1 - drop_out
    if os.path.exists("./costs.log"):
        os.remove("./costs.log")
    X_train_orig, Y_train_orig, X_verify_orig, Y_verify_orig = load_data_set(refresh=reload_data,
                                                                             train_ratio=train_ratio,
                                                                             sample_path=sample_path)
    X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1)
    X_verify_flatten = X_verify_orig.reshape(X_verify_orig.shape[0], -1)
    # simple normalization to [0, 1]

    X_train = X_train_flatten / 255
    X_verify = X_verify_flatten / 255
    Y_train = np.zeros((IMG_LABEL_NUM, len(LABELS), X_train.shape[1]))
    Y_verify = np.zeros((IMG_LABEL_NUM, len(LABELS), X_verify.shape[1]))
    for label_index in range(IMG_LABEL_NUM):
        Y_train[label_index] = convert_to_one_hot(Y_train_orig[label_index], len(LABELS))
        Y_verify[label_index] = convert_to_one_hot(Y_verify_orig[label_index], len(LABELS))

    costs, parameters = build_model(X_train, Y_train, layers_dims, num_epochs=num_epochs, gd="adam",
                                    keep_prob=keep_prob,
                                    lambd=lambd, mini_batch_size=mini_batch_size, print_mini_cost=True, begin=begin,
                                    learning_rate=learning_rate)
    show_accuracy(500)


if __name__ == "__main__":
    begin = 0
    num_epochs = 2000
    drop_out = 0
    lambd = 1
    learning_rate = 0.0001
    mini_batch_size = 32
    layers_dims = [IMG_HEIGHT * IMG_WIDTH * IMG_CHANNEL, 120, 72, len(LABELS)]
    train_ratio = 0.8
    sample_path = "/tmp/samples"
    start(begin, num_epochs, drop_out, lambd, learning_rate, mini_batch_size, layers_dims, train_ratio,
          reload_data=True, sample_path=sample_path)