机器学习：Logistic回归（Python）

Logistic回归（二分类）

logistic_regression_class2.py

python 复制代码

import numpy as np
import matplotlib.pyplot as plt


class LogisticRegression:
    """
    逻辑回归，采用梯度下降算法 + 正则化，交叉熵损失函数，实现二分类
    """
    def __init__(self, fit_intercept=True, normalize=True, alpha=0.05, eps=1e-10,
                 max_epochs=300, batch_size=20, l1_ratio=None, l2_ratio=None, en_rou=None):
        """
        :param eps: 提前停止训练的精度要求，按照两次训练损失的绝对值差小于eps，停止训练
        :param fit_intercept: 是否训练偏置项
        :param normalize: 是否标准化
        :param alpha: 学习率
        :param max_epochs: 最大迭代次数
        :param batch_size: 批量大小，若为1，则为随机梯度，若为训练集样本量，则为批量梯度，否则为小批量梯度
        :param l1_ratio: LASSO回归惩罚项系数
        :param l2_ratio: 岭回归惩罚项系数
        :param en_rou: 弹性网络权衡L1和L2的系数
        """
        self.fit_intercept = fit_intercept  # 线性模型的常数项。也即偏置bias，模型中的theta0
        self.normalize = normalize  # 是否标准化数据
        self.alpha = alpha  # 学习率
        self.eps = eps  # 提前停止训练
        if l1_ratio:
            if l1_ratio < 0:
                raise ValueError("惩罚项系数不能为负数")
        self.l1_ratio = l1_ratio  # LASSO回归惩罚项系数
        if l2_ratio:
            if l2_ratio < 0:
                raise ValueError("惩罚项系数不能为负数")
        self.l2_ratio = l2_ratio  # 岭回归惩罚项系数
        if en_rou:
            if en_rou > 1 or en_rou < 0:
                raise ValueError("弹性网络权衡系数范围在[0, 1]")
        self.en_rou = en_rou  # 弹性网络权衡L1和L2的系数
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.theta = None  # 训练权重系数
        if normalize:
            self.feature_mean, self.feature_std = None, None  # 特征的均值，标准方差
        self.n_samples, self.n_features = 0, 0  # 样本量和特征数
        self.train_loss, self.test_loss = [], []  # 存储训练过程中的训练损失和测试损失

    def init_theta_params(self, n_features):
        """
        初始化参数
        如果训练偏置项，也包含了bias的初始化
        :return:
        """
        self.theta = np.random.randn(n_features, 1) * 0.1

    @staticmethod
    def sigmoid(x):
        """
        sigmoid函数，为避免上溢或下溢，对参数x做限制
        :param x: 可能是标量数据，也可能是数组
        :return:
        """
        x = np.asarray(x)  # 为避免标量值的布尔索引出错，转换为数组
        x[x > 30.0] = 30.0  # 避免下溢
        x[x < -50] = -50.0  # 避免上溢
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def sign_func(weight):
        """
        符号函数，针对L1正则化
        :param weight: 模型系数
        :return:
        """
        sign_values = np.zeros(weight.shape)
        sign_values[np.argwhere(weight > 0)] = 1  # np.argwhere(weight > 0) 返回值是索引下标
        sign_values[np.argwhere(weight < 0)] = -1
        return sign_values

    @staticmethod
    def cal_cross_entropy(y_test, y_prob):
        """
        计算交叉熵损失
        :param y_test: 样本真值
        :param y_prob: 模型预测类别概率
        :return:
        """
        loss = -(y_test.T.dot(np.log(y_prob)) + (1 - y_test).T.dot(np.log(1 - y_prob)))
        return loss

    def fit(self, x_train, y_train, x_test=None, y_test=None):
        """
        样本的预处理，模型系数的求解，闭式解公式 + 梯度方法
        :param x_train: 训练样本集 m*k
        :param y_train: 训练目标集 m*1
        :param x_test: 测试样本集 n*k
        :param y_test: 测试目标集 n*1
        :return:
        """
        if self.normalize:
            self.feature_mean = np.mean(x_train, axis=0)  # 样本均值
            self.feature_std = np.std(x_train, axis=0) + 1e-8  # 样本方差
            x_train = (x_train - self.feature_mean) / self.feature_std  # 标准化
            if x_test is not None:
                x_test = (x_test - self.feature_mean) / self.feature_std  # 标准化
        if self.fit_intercept:
            x_train = np.c_[x_train, np.ones_like(y_train)]  # 添加一列1，即偏置项样本
            if x_test is not None and y_test is not None:
                x_test = np.c_[x_test, np.ones_like(y_test)]  # 添加一列1，即偏置项样本
        self.init_theta_params(x_train.shape[1])  # 初始化参数
        # 训练模型
        self._fit_gradient_desc(x_train, y_train, x_test, y_test)  # 梯度下降法训练模型

    def _fit_gradient_desc(self, x_train, y_train, x_test=None, y_test=None):
        """
        三种梯度下降求解 + 正则化：
        （1）如果batch_size为1，则为随机梯度下降法
        （2）如果batch_size为样本量，则为批量梯度下降法
        （3）如果batch_size小于样本量，则为小批量梯度下降法
        :return:
        """
        train_sample = np.c_[x_train, y_train]  # 组合训练集和目标集，以便随机打乱样本
        # np.c_水平方向连接数组，np.r_竖直方向连接数组
        # 按batch_size更新theta，三种梯度下降法取决于batch_size的大小
        best_theta, best_mse = None, np.infty  # 最佳训练权重与验证均方误差
        for epoch in range(self.max_epochs):
            self.alpha *= 0.95
            np.random.shuffle(train_sample)  # 打乱样本顺序，模拟随机化
            batch_nums = train_sample.shape[0] // self.batch_size  # 批次
            for idx in range(batch_nums):
                # 取小批量样本，可以是随机梯度（1），批量梯度（n）或者是小批量梯度（<n）
                batch_xy = train_sample[self.batch_size * idx: self.batch_size * (idx + 1)]
                # 分取训练样本和目标样本，并保持维度
                batch_x, batch_y = batch_xy[:, :-1], batch_xy[:, -1:]
                # 计算权重更新增量，包含偏置项
                y_prob_batch = self.sigmoid(batch_x.dot(self.theta))  # 小批量的预测概率
                # 1 * n <--> n * k = 1 * k --> 转置 k * 1
                delta = ((y_prob_batch - batch_y).T.dot(batch_x) / self.batch_size).T
                # 计算并添加正则化部分，不包含偏置项
                dw_reg = np.zeros(shape=(x_train.shape[1] - 1, 1))
                if self.l1_ratio and self.l2_ratio is None:
                    # LASSO回归，L1正则化
                    dw_reg = self.l1_ratio * self.sign_func(self.theta[:-1])
                if self.l2_ratio and self.l1_ratio is None:
                    # Ridge回归，L2正则化
                    dw_reg = 2 * self.l2_ratio * self.theta[:-1]
                if self.en_rou and self.l1_ratio and self.l2_ratio:
                    # 弹性网络
                    dw_reg = self.l1_ratio * self.en_rou * self.sign_func(self.theta[:-1])
                    dw_reg += 2 * self.l2_ratio * (1 - self.en_rou) * self.theta[:-1]
                delta[:-1] += dw_reg / self.batch_size  # 添加了正则化
                self.theta = self.theta - self.alpha * delta
            # 计算训练过程中的交叉熵损失值
            y_train_prob = self.sigmoid(x_train.dot(self.theta))  # 当前迭代训练的模型预测概率
            train_cost = self.cal_cross_entropy(y_train, y_train_prob)  # 训练集的交叉熵损失
            self.train_loss.append(train_cost / x_train.shape[0])  # 交叉熵损失均值
            if x_test is not None and y_test is not None:
                y_test_prob = self.sigmoid(x_test.dot(self.theta))  # 当前测试样本预测概率
                test_cost = self.cal_cross_entropy(y_test, y_test_prob)
                self.test_loss.append(test_cost / x_test.shape[0])  # 交叉熵损失均值
            # 两次交叉熵损失均值的差异小于给定的均值，提前停止训练
            if epoch > 10 and (np.abs(self.train_loss[-1] - self.train_loss[-2])) <= self.eps:
                break

    def get_params(self):
        """
        返回线性模型训练的系数
        :return:
        """
        if self.fit_intercept:  # 存在偏置项
            weight, bias = self.theta[:-1], self.theta[-1]
        else:
            weight, bias = self.theta, np.array([0])
        if self.normalize:  # 标准化后的系数
            weight = weight / self.feature_std.reshape(-1, 1)  # 还原模型系数
            bias = bias - weight.T.dot(self.feature_mean)
        return weight.reshape(-1), bias

    def predict_prob(self, x_test):
        """
        预测测试样本的概率，第1列为y = 0的概率，第2列是y = 1的概率
        :param x_test: 测试样本，ndarray：n * k
        :return:
        """
        y_prob = np.zeros((x_test.shape[0], 2))  # 预测概率
        if self.normalize:
            x_test = (x_test - self.feature_mean) / self.feature_std  # 测试数据标准化
        if self.fit_intercept:
            # 存在偏置项，加一列1
            x_test = np.c_[x_test, np.ones(shape=x_test.shape[0])]
        y_prob[:, 1] = self.sigmoid(x_test.dot(self.theta)).reshape(-1)
        y_prob[:, 0] = 1 - y_prob[:, 1]  # 类别y = 0的概率
        return y_prob

    def predict(self, x, p=0.5):
        """
        预测样本类别，默认大于0.5为1，小于0.5为0
        :param x: 预测样本
        :param p: 概率阈值
        :return:
        """
        y_prob = self.predict_prob(x)
        # 布尔值转换为整数，true对应1，false对应0
        return (y_prob[:, 1] > p).astype(int)

    def plt_loss_curve(self, lab=None, is_show=True):
        """
        可视化交叉熵损失曲线
        :param is_show: 是否可视化
        :return:
        """
        if is_show:
            plt.figure(figsize=(8, 6))
        plt.plot(self.train_loss, "k-", lw=1, label="Train Loss")
        if self.test_loss:
            plt.plot(self.test_loss, "r--", lw=1.2, label="Test Loss")
        plt.xlabel("Training Epochs", fontdict={"fontsize": 12})
        plt.ylabel("The Mean of Cross Entropy Loss", fontdict={"fontsize": 12})
        plt.title("%s: The Loss Curve of Cross Entropy" % lab)
        plt.legend(frameon=False)
        plt.grid(ls=":")
        # plt.axis([0, 300, 20, 30])
        if is_show:
            plt.show()

performance_metrics.py

python 复制代码

import numpy as np  # 数值计算
import pandas as pd  # 数值分析
import matplotlib.pyplot as plt  # 可视化
import seaborn as sns


class ModelPerformanceMetrics:
    """
    模型性能度量，分二分类和多分类，模型的泛化性能度量
    1. 计算混淆矩阵
    2. 计算分类报告，模板采用sklearn.classification_report格式
    3. 计算P（查准率）R（查全率）指标，并可视化P---R曲线，计算AP
    4. 计算ROC的指标：真正例率，假正例率，并可视化ROC曲线，计算AUC
    5. 计算代价曲线，归一化指标、正例概率代价、可视化代价曲线，并计算期望总体代价
    """

    def __init__(self, y_true, y_prob):
        """
        初始化参数
        :param y_true: 样本的真实类别
        :param y_prob: 样本的预测类别概率
        """
        self.y_true = np.asarray(y_true, dtype=np.int64)
        self.y_prob = np.asarray(y_prob, np.float64)  # 列数与类别数一致
        self.n_samples, self.n_class = self.y_prob.shape  # 样本量和类别数
        if self.n_class > 2:
            self.y_true = self.label_one_hot()
        else:
            self.y_true = self.y_true.reshape(-1)
        self.cm = self.cal_confusion_matrix()   # 计算混淆矩阵

    def label_one_hot(self):
        """
        对真实类别标签进行one---hot编码，编码后的维度与模型预测概率维度一致
        :return: y_true_lab
        """
        y_true_lab = np.zeros((self.n_samples, self.n_class))
        for i in range(self.n_samples):
            y_true_lab[i, self.y_true[i]] = 1
        return y_true_lab

    def cal_confusion_matrix(self):
        """
        计算并构建混淆矩阵
        :return: confusion_matrix
        """
        confusion_matrix = np.zeros((self.n_class, self.n_class), dtype=np.int64)
        for i in range(self.n_samples):
            idx = np.argmax(self.y_prob[i, :])  # 最大概率所对应的索引，即是类别
            if self.n_class == 2:
                idx_true = self.y_true[i]   # 第i个样本的真实类别
            else:
                idx_true = np.argmax(self.y_true[i, :])
            if idx_true == idx:
                confusion_matrix[idx, idx] += 1   # 预测正确，则在对角线位置加1
            else:
                confusion_matrix[idx_true, idx] += 1   # 预测错误，则在真实类别行，预测错误列加1
        return confusion_matrix

    def cal_classification_report(self, target_names=None):
        """
        计算并构造分类报告
        :param self:
        :return:
        """
        precision = np.diag(self.cm) / np.sum(self.cm, axis=0)  # 查准率
        recall = np.diag(self.cm) / np.sum(self.cm, axis=1)     # 查全率
        f1_score = 2 * precision * recall / (precision + recall)  # F1调和平均
        support = np.sum(self.cm, axis=1, dtype=np)   # 各个类别的支持样本量
        support_all = np.sum(support)  # 总的样本量
        accuracy = np.sum(np.diag(self.cm))  / support_all  # 准确率
        p_m, r_m = precision.mean(), recall.mean()
        macro_avg = [p_m, r_m, 2 * p_m * r_m / (p_m + r_m)]  # 宏指标
        weight = support / support_all  # 以各个类别的样本量所占总的样本量比例为权重
        weighted_avg = [np.sum(weight * precision), np.sum(weight * recall), np.sum(weight * f1_score)]

        # 构造分类报告
        metrics_1 = pd.DataFrame(np.array([precision, recall, f1_score, support]).T,
                                 columns=["precision", "recall", "f1_score", "support"])
        metrics_2 = pd.DataFrame([["", "", "", ""], ["", "", accuracy, support_all],
                                 np.hstack([macro_avg, support_all]),
                                 np.hstack([weighted_avg, support_all])],
                                 columns=["precision", "recall", "f1_score", "support"])
        c_report = pd.concat([metrics_1, metrics_2], ignore_index=False)
        if target_names is None:  # 类别标签未传参，则默认类别标签为0、1、2...
            target_names = [str(i) for i in range(self.n_class)]
        else:
            target_names = list(target_names)
        target_names.extend(["", "accuracy", "macro_avg", "weighted_avg"])
        c_report.index = target_names
        return c_report

    @staticmethod
    def __sort_positive__(y_prob):
        """
        按照预测为正例的概率进行降序排列，并返回排序的索引向量
        :param y_prob: 一维数组，样本预测为正例的概率
        :return:
        """
        idx = np.argsort(y_prob)[::-1]  # 降序排列
        return idx

    def precision_recall_curve(self):
        """
        Precision和Recall曲线，计算各坐标点的值，可视化P---R曲线
        :return:
        """
        pr_array = np.zeros((self.n_samples, 2))  # 存储每个样本预测概率作为阈值时的P和R指标
        if self.n_class == 2: # 二分类
            idx = self.__sort_positive__(self.y_prob[:, 0])  # 降序排列索引
            y_true = self.y_true[idx]  # 真值类别标签按照排序索引进行排序
            # 针对每个样本，把预测概率作为阈值，计算各指标
            for i in range(self.n_samples):
                tp, fn, tn, fp = self.__cal_sub_metrics__(y_true, i + 1)
                pr_array[i, :] = tp / (tp + fn), tp / (tp + fp)
        else:
            precision = np.zeros((self.n_samples, self.n_class))  # 查准率
            recall = np.zeros((self.n_samples, self.n_class))  # 查全率
            for k in range(self.n_class):  # 针对每个类别，分别计算P、R指标，然后平均
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # 真值类别第k列
                y_true = y_true_k[idx]  # 对第k个类别的真值排序
                # 针对每个样本，把预测概率作为阈值，计算各指标
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__cal_sub_metrics__(y_true, i + 1)
                    precision[i, k] = tp / (tp + fp)  # 查准率
                    recall[i, k] = tp / (tp + fn)  # 查全率
            pr_array = np.array([np.mean(recall, axis=1), np.mean(precision, axis=1)]).T
        return pr_array

    def roc_metrics_curve(self):
        """
        ROC曲线，计算真正例率和假正例率，并可视化
        :return:
        """
        roc_array = np.zeros((self.n_samples, 2))  # 存储每个样本预测概率作为阈值时的TPR和FPR指标
        if self.n_class == 2:  # 二分类
            idx = self.__sort_positive__(self.y_prob[:, 0])  # 降序排列索引
            y_true = self.y_true[idx]  # 真值类别标签按照排序索引进行排序
            # 针对每个样本，把预测概率作为阈值，计算各指标
            n_nums, p_nums = len(y_true[y_true == 1]), len(y_true[y_true == 0])  # 真实类别中反例与正例的样本量
            tp, fn, tn, fp = self.__cal_sub_metrics__(y_true, 1)
            roc_array[0, :] = fp / (tn + fp), tp / (tp + fn)
            for i in range(1, self.n_samples):
                #tp, fn, tn, fp = self.__cal_sub_metrics__(y_true, i + 1)
                if y_true[i] == 1:
                    roc_array[i, :] = roc_array[i - 1, 0] + 1 / n_nums, roc_array[i - 1, 1]
                else:
                    roc_array[i, :] = roc_array[i - 1, 0], roc_array[i - 1, 1] + 1 / p_nums
                #roc_array[i, :] = fp / (tn + fp), tp / (tp + fn)
        else:  # 多分类
            precision = np.zeros((self.n_samples, self.n_class))  # 查准率
            recall = np.zeros((self.n_samples, self.n_class))  # 查全率
            for k in range(self.n_class):  # 针对每个类别，分别计算P、R指标，然后平均
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # 真值类别第k列
                y_true = y_true_k[idx]  # 对第k个类别的真值排序
                # 针对每个样本，把预测概率作为阈值，计算各指标
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__cal_sub_metrics__(y_true, i + 1)
                    precision[i, k] = tp / (tp + fp)  # 查准率
                    recall[i, k] = tp / (tp + fn)  # 查全率
            roc_array = np.array([np.mean(recall, axis=1), np.mean(precision, axis=1)]).T
        return roc_array

    def __cal_sub_metrics__(self, y_true_sort, n):
        """
        计算TP、TN、FP、TN
        :param y_true_sort: 排序后的真实类别
        :param n: 以第n个样本预测概率为阈值
        :return:
        """
        if self.n_class == 2:
            pre_label = np.r_[np.zeros(n, dtype=np.int64), np.ones(self.n_samples - n, dtype=np.int64)]
            tp = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # 真正例
            tn = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # 真反例
            fp = np.sum(y_true_sort) - tn  # 假正例
            fn = self.n_samples - tp - tn - fp  # 假反例
        else:
            pre_label = np.r_[np.ones(n, dtype=np.int64), np.zeros(self.n_samples - n, dtype=np.int64)]
            tp = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # 真正例
            tn = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # 真反例
            fn = np.sum(y_true_sort) - tp  # 假正例
            fp = self.n_samples - tp - tn - fn  # 假反例
        return tp, fn, tn, fp

    @staticmethod
    def __cal_ap__(pr_val):
        """
        计算AP
        :param pr_val: PR指标各坐标点的数组
        :return:
        """
        return (pr_val[1:, 0] - pr_val[0:-1, 0]).dot(pr_val[1:, 1])

    @staticmethod
    def __cal_auc__(roc_val):
        """
        计算ROC曲线下的面积，即AUC
        :param roc_val:
        :return:
        """
        return (roc_val[1:, 0] - roc_val[0:-1, 0]).dot(roc_val[:-1, 1] + roc_val[1:, 1]) / 2

    def plt_pr_curve(self, pr_val, label=None, is_show=True):
        """
        可视化PR曲线
        :param pr_val: PR指标各坐标点的数组
        :return:
        """
        ap = self.__cal_ap__(pr_val)
        if is_show:
            plt.figure(figsize=(7, 5))
        if label:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post",
                     label = label + ", AP = %.3f" % ap)
        else:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post")
        plt.title("Precision Recall Curve of Test Samples and AP = %.3f" % ap)
        plt.xlabel("Recall", fontdict={"fontsize": 12})
        plt.ylabel("Precision", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()

    def plt_roc_curve(self, roc_val, label=None, is_show=True):
        """
        可视化ROC曲线
        :param roc_val: ROC指标各坐标点的数组
        :return:
        """
        auc = self.__cal_auc__(roc_val)
        if is_show:
            plt.figure(figsize=(7, 5))
        if label:
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post",
                     label = label + ", AP = %.3f" % auc)
        else:
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post")
        plt.title("ROC Curve of Test Samples and AUC = %.3f" % auc)
        plt.xlabel("False Positive Rate", fontdict={"fontsize": 12})
        plt.ylabel("True Positive Rate", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()

    @staticmethod
    def plt_confusion_matrix(confusion_matrix, label_names=None, is_show=True):
        """
        可视化混淆矩阵
        :param confusion_matrix: 混淆矩阵
        :return:
        """
        sns.set()
        cm = pd.DataFrame(confusion_matrix, columns=label_names, index=label_names)
        sns.heatmap(cm, annot=True, cbar=False)
        acc = np.diag(confusion_matrix).sum() / confusion_matrix.sum()
        plt.title("Confusion Matrix and ACC = %.5f" % acc)
        plt.xlabel("Predict", fontdict={"fontsize": 12})
        plt.ylabel("True", fontdict={"fontsize": 12})
        if is_show:
            plt.show()

test_logistic_reg_2.py

python 复制代码

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from logistic_regression_2class import LogisticRegression
import matplotlib.pyplot as plt
from performance_metrics import ModelPerformanceMetrics


bc_data = load_breast_cancer()  # 加载数据集
X, y = bc_data.data, bc_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

lg_lr = LogisticRegression(alpha=0.5, l1_ratio=0.5, batch_size=20, max_epochs=1000, eps=1e-15)
lg_lr.fit(X_train, y_train, X_test, y_test)

print("L1正则化模型参数如下：")
theta = lg_lr.get_params()
fn = bc_data.feature_names
for i, w in enumerate(theta[0]):
    print(fn[i], ":", w)
print("theta0:", theta[1])

print("=" * 70)
y_test_prob = lg_lr.predict_prob(X_test)  # 预测概率
y_test_labels = lg_lr.predict(X_test)

plt.figure(figsize=(12, 8))
plt.subplot(221)
lg_lr.plt_loss_curve(lab="L1", is_show=False)

pm = ModelPerformanceMetrics(y_test, y_test_prob)
print(pm.cal_classification_report())

pr_values = pm.precision_recall_curve()  # PR指标值
plt.subplot(222)
pm.plt_pr_curve(pr_values, is_show=False)  # PR曲线

roc_values = pm.roc_metrics_curve()  # ROC指标值
plt.subplot(223)
pm.plt_roc_curve(roc_values, is_show=False)  # ROC曲线

plt.subplot(224)
cm = pm.cal_confusion_matrix()
pm.plt_confusion_matrix(cm, label_names=["malignant", "benign"], is_show=False)

plt.tight_layout()
plt.show()

输出结果：