机器学习:线性判别分析LDA(Python)

一、线性判别分析的定义

二、线性判别分析——二分类模型

lda2classify.py

python 复制代码
import numpy as np


class LDABinaryClassifier:
    """
    Two-class classifier based on linear discriminant analysis (LDA).

    ``fit`` computes the per-class mean vectors and scatter matrices, derives
    the projection direction ``Sw^-1 (mu_a - mu_b)`` and a threshold on the
    projected axis. ``predict`` projects samples onto that direction and
    compares them with the threshold. The two class labels may be any pair of
    distinct values (not only 0 and 1); ``a`` is the smaller label and ``b``
    the larger one in sorted order.
    """
    def __init__(self):
        self.mu = None  # per-class mean vectors, keyed by class label
        self.Sw_i = None  # per-class scatter matrices, keyed by class label
        self.Sw = None  # within-class scatter matrix (sum of the per-class ones)
        self.weight = None  # model coefficients, i.e. the projection direction
        self.w0 = None  # decision threshold on the projected axis
        self.class_values = None  # the two class labels seen by fit, sorted

    def fit(self, x_train, y_train):
        """
        Compute the projection direction and the decision threshold.

        :param x_train: training samples, 2-D array-like (n_samples, n_features)
        :param y_train: training labels, 1-D array-like with exactly two distinct values
        :return: the projection direction ``self.weight``
        :raises ValueError: if ``y_train`` does not contain exactly two classes
        """
        x_train, y_train = np.asarray(x_train), np.asarray(y_train)
        class_values = np.unique(y_train)  # np.unique already returns sorted labels
        n_samples, n_features = x_train.shape
        if len(class_values) != 2:
            raise ValueError("仅限于二分类且线性可分数据集......")
        self.class_values = class_values

        # 1. Per-class means and scatter matrices, accumulated into Sw.
        self.Sw_i = dict()
        self.mu = dict()
        self.Sw = np.zeros((n_features, n_features))
        class_size = []  # sample count of each class, in class_values order
        for label_val in class_values:
            class_x = x_train[y_train == label_val]
            class_size.append(class_x.shape[0])
            self.mu[label_val] = np.mean(class_x, axis=0)
            centered = class_x - self.mu[label_val]
            self.Sw_i[label_val] = centered.T.dot(centered)
            self.Sw += self.Sw_i[label_val]

        # 2. Projection direction. The means are looked up by the labels that
        #    were actually observed instead of the literal keys 0 and 1, so
        #    label pairs such as (-1, 1) or (1, 2) no longer raise KeyError.
        first_label, second_label = class_values[0], class_values[1]
        inv_sw = np.linalg.inv(self.Sw)
        self.weight = inv_sw.dot(self.mu[first_label] - self.mu[second_label])

        # 3. Threshold: class-size-weighted average of the projected class means.
        self.w0 = (class_size[0] * self.weight.dot(self.mu[first_label])
                   + class_size[1] * self.weight.dot(self.mu[second_label])) / n_samples

        return self.weight

    def predict(self, x_test):
        """
        Predict the class label of each test sample.

        :param x_test: 2-D array-like (n_samples, n_features); a single 1-D
            sample is treated as one row
        :return: 1-D array of predicted labels, taken from the labels seen in ``fit``
        :raises ValueError: if the model has not been fitted
        """
        if self.weight is None or self.w0 is None:
            raise ValueError("请先进行fit,计算投影方向及判别阈值...")
        x_test = np.atleast_2d(np.asarray(x_test))
        scores = x_test.dot(self.weight) - self.w0
        # Fall back to the historical 0/1 coding when weight/w0 were set by
        # hand and no labels were recorded.
        if self.class_values is None:
            labels = np.array([0, 1], dtype=np.int64)
        else:
            labels = self.class_values
        # A projection below the threshold means the second (larger) label.
        return np.where(scores < 0, labels[1], labels[0])

test_lda2classify.py

python 复制代码
from sklearn.datasets import load_iris, load_breast_cancer
from lda2classify import LDABinaryClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Alternative data set: the first two classes of iris (first 100 samples).
# iris = load_iris()
# X, y = iris.data[:100, :], iris.target[:100]

# Breast-cancer data set: feature matrix and target vector.
bc_data = load_breast_cancer()
X = bc_data.data
y = bc_data.target

# Stratified 70/30 split with a fixed seed so the report is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=111,
    stratify=y,
)

# Fit the hand-written binary LDA model and report on the held-out part.
lda = LDABinaryClassifier()
lda.fit(X_train, y_train)
y_test_pred = lda.predict(X_test)
report = classification_report(y_test, y_test_pred)
print(report)

鸢尾花取前两类:

breast_cancer 数据集:

三、线性判别分析——多分类降维算法

lda_multi_dim_reduction.py

python 复制代码
import numpy as np
import scipy as sp

class LDAMulti_DimReduction:
    """
    Multi-class linear discriminant analysis used for dimensionality reduction.

    ``fit`` builds the within-class scatter matrix ``Sw`` and the
    between-class scatter matrix ``Sb``, solves the generalized eigenproblem
    of ``(Sb, Sw)`` and keeps the eigenvectors belonging to the
    ``n_components`` largest eigenvalues as the projection matrix ``W``.
    """
    def __init__(self, n_components=2):
        self.n_components = n_components  # number of dimensions after projection
        self.Sw, self.Sb = None, None  # within-class / between-class scatter matrices
        self.eig_values = None  # generalized eigenvalues, sorted in descending order
        self.W = None  # projection matrix, one column per kept component

    def fit(self, x_samples, y_target):
        """
        Compute the projection matrix from labelled samples.

        :param x_samples: 2-D array-like (n_samples, n_features)
        :param y_target: 1-D array-like of class labels, one per sample
        :return: the projection matrix ``self.W``
        """
        # Import the submodule explicitly: a bare ``import scipy`` does not
        # guarantee that ``scipy.linalg`` is loaded on older SciPy releases.
        from scipy import linalg as sp_linalg

        x_samples, y_target = np.asarray(x_samples), np.asarray(y_target)
        _, n_features = x_samples.shape

        # Within-class scatter: sum of the scatter of every class around its own mean.
        self.Sw = np.zeros((n_features, n_features))
        for label_val in np.unique(y_target):
            class_x = x_samples[y_target == label_val]
            centered = class_x - np.mean(class_x, axis=0)
            self.Sw += centered.T.dot(centered)

        # Between-class scatter obtained as total scatter minus within-class scatter.
        centered_all = x_samples - np.mean(x_samples, axis=0)
        self.Sb = centered_all.T.dot(centered_all) - self.Sw

        self.eig_values, eig_vec = sp_linalg.eig(self.Sb, self.Sw)
        idx = np.argsort(self.eig_values)[::-1]  # largest eigenvalue first
        self.eig_values = self.eig_values[idx]
        self.W = eig_vec[:, idx][:, :self.n_components]
        return self.W

    def transform(self, x_samples):
        """
        Project samples with the fitted projection matrix.

        :param x_samples: 2-D array-like (n_samples, n_features)
        :return: projected samples
        :raises ValueError: if ``fit`` has not been called yet
        """
        if self.W is None:
            raise ValueError("请先进行fit,构造投影矩阵,然后降维...")
        # Convert like fit does, so plain lists are accepted here as well.
        return np.asarray(x_samples).dot(self.W)

    def fit_transform(self, x_samples, y_target):
        """
        Fit the projection matrix and project the same samples.

        :param x_samples: 2-D array-like (n_samples, n_features)
        :param y_target: 1-D array-like of class labels, one per sample
        :return: projected samples
        """
        self.fit(x_samples, y_target)
        # Delegate to transform so list input accepted by fit also works here.
        return self.transform(x_samples)

    def variance_explained(self):
        """
        Ratio of each kept eigenvalue to the sum of all eigenvalues.

        :return: the first ``n_components`` ratios
        :raises ValueError: if ``fit`` has not been called yet
        """
        if self.eig_values is None:
            raise ValueError("请先进行fit,计算广义特征值...")
        # Drop the imaginary part only when it is exactly zero everywhere.
        if not np.any(np.imag(self.eig_values) != 0):
            self.eig_values = np.real(self.eig_values)
        ratio = self.eig_values / np.sum(self.eig_values)
        return ratio[:self.n_components]

test_lda_dim_reduction.py

python 复制代码
from sklearn.datasets import load_iris, load_wine, make_classification
from lda_multi_dim_reduction import LDAMulti_DimReduction
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


# Demo: compare the hand-written LDA projection with scikit-learn's on the same data.
iris = load_iris()
X, y = iris.data, iris.target

# Alternative data sets (uncomment one pair to use it instead of iris).
# X, y = make_classification(n_samples=2000, n_features=20, n_informative=3, n_classes=5,
#                            n_redundant=0, n_clusters_per_class=1, class_sep=2, random_state=42)
#
# wine = load_wine()
# X, y = wine.data, wine.target

X = StandardScaler().fit_transform(X)

# Hand-written implementation. Three components are requested so the optional
# PC2/PC3 plot below has data; only the first two columns are plotted by default.
lda = LDAMulti_DimReduction(n_components=3)
lda.fit(X, y)
x_new = lda.transform(X)
print(lda.variance_explained())

plt.figure(figsize=(14, 5))

# Left panel: projection produced by the hand-written implementation.
plt.subplot(121)
plt.scatter(x_new[:, 0], x_new[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Myself)", fontdict={"fontsize": 14})
plt.grid(ls=":")
# plt.subplot(222)
# plt.scatter(x_new[:, 1], x_new[:, 2], marker="o", c=y)
# plt.xlabel("PC2", fontdict={"fontsize": 12})
# plt.ylabel("PC3", fontdict={"fontsize": 12})
# plt.title("LDA Dimension Reduction (Myself)", fontdict={"fontsize": 14})
# plt.grid(ls=":")

# Right panel: projection produced by scikit-learn. Note that `lda` is rebound
# to the scikit-learn estimator from here on.
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
x_skl = lda.transform(X)
plt.subplot(122)
# Plot the scikit-learn result; the earlier code plotted x_new here, which made
# both panels show the hand-written projection and left x_skl unused.
plt.scatter(x_skl[:, 0], x_skl[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Sklearn)", fontdict={"fontsize": 14})
plt.grid(ls=":")
# plt.subplot(224)
# plt.scatter(x_new[:, 1], x_new[:, 2], marker="o", c=y)
# plt.xlabel("PC2", fontdict={"fontsize": 12})
# plt.ylabel("PC3", fontdict={"fontsize": 12})
# plt.title("LDA Dimension Reduction (Sklearn)", fontdict={"fontsize": 14})
# plt.grid(ls=":")

plt.tight_layout()
plt.show()

鸢尾花数据集:

降维后前两个主特征的解释方差比

9.91212605e-01 8.78739503e-03

红酒数据集:

降维后前两个主特征的解释方差比

6.87478888e-01 3.12521112e-01

使用make_classification创建数据集:

降维后前三个主特征的解释方差比

0.47101585 0.44946339 0.07876534

相关推荐
幼儿园园霸柒柒10 分钟前
第七章:7.2求方程a*x*x+b*x+c=0的根,用3个函数,分别求当:b*b-4*a*c大于0、等于0和小于0时的根并输出结果。从主函数输入a、b、c的值
c语言·开发语言·算法·c#
恶霸不委屈12 分钟前
突破精度极限!基于DeepSeek的无人机航拍图像智能校准系统技术解析
人工智能·python·无人机·deepseek
u01037310616 分钟前
Django REST Framework (DRF)
后端·python·django
雨中夜归人18 分钟前
自动化测试工具playwright中文文档-------14.Chrome 插件
python·测试工具·自动化·pytest·playwright
阳洞洞27 分钟前
leetcode 213. House Robber II
算法·leetcode·动态规划
梭七y27 分钟前
【力扣hot100题】(099)寻找重复数
算法·leetcode·职场和发展
小媛早点睡41 分钟前
贪心算法day11(用最少数量的箭引爆气球)
算法·贪心算法
飞天狗11142 分钟前
数据结构——二叉树
数据结构·算法
lixy5791 小时前
深度学习之自动微分
人工智能·python·深度学习
神经星星1 小时前
【TVM教程】microTVM TFLite 指南
人工智能·机器学习·编程语言