机器学习:线性判别分析LDA(Python)

一、线性判别分析的定义

二、线性判别分析——二分类模型

lda2classify.py

python 复制代码
import numpy as np


class LDABinaryClassifier:
    """
    Two-class linear discriminant analysis (Fisher LDA) classifier.

    Samples are projected onto one direction and compared against a
    threshold. The two class labels may be any pair of sortable values
    (0/1, -1/1, strings, ...); predictions are returned as those labels.
    """
    def __init__(self):
        self.mu = None  # dict: class label -> mean vector of that class
        self.Sw_i = None  # dict: class label -> scatter matrix of that class
        self.Sw = None  # within-class scatter matrix (sum of the per-class ones)
        self.weight = None  # model coefficients, i.e. the projection direction
        self.w0 = None  # decision threshold on the projected axis
        self.class_values = None  # the two class labels seen in fit, sorted

    def fit(self, x_train, y_train):
        """
        Compute the projection direction and the decision threshold.

        :param x_train: training samples, 2-D array-like (n_samples, n_features)
        :param y_train: training labels, 1-D array-like with exactly two distinct values
        :return: the projection direction (also stored in ``self.weight``)
        :raises ValueError: if ``y_train`` does not contain exactly two classes
        """
        x_train, y_train = np.asarray(x_train), np.asarray(y_train)
        class_values = np.unique(y_train)  # np.unique already returns sorted labels
        n_samples, n_features = x_train.shape
        if len(class_values) != 2:
            raise ValueError("仅限于二分类且线性可分数据集......")
        self.class_values = class_values

        # 1. Per-class means and scatter matrices, accumulated into Sw.
        self.Sw_i = dict()
        self.mu = dict()
        self.Sw = np.zeros((n_features, n_features))
        class_size = []  # sample count per class, same order as class_values
        for label_val in class_values:
            class_x = x_train[y_train == label_val]
            class_size.append(class_x.shape[0])
            self.mu[label_val] = np.mean(class_x, axis=0)
            centered = class_x - self.mu[label_val]
            self.Sw_i[label_val] = centered.T.dot(centered)
            self.Sw += self.Sw_i[label_val]

        # 2. Projection direction w = Sw^+ (mu_first - mu_second).
        # The means are looked up by the actual labels rather than by 0/1, and
        # the pseudo-inverse keeps this working when Sw is singular
        # (e.g. a constant feature).
        first_label, second_label = class_values
        inv_sw = np.linalg.pinv(self.Sw)
        self.weight = inv_sw.dot(self.mu[first_label] - self.mu[second_label])

        # 3. Threshold: size-weighted average of the two projected class means.
        projected_first = self.weight.dot(self.mu[first_label])
        projected_second = self.weight.dot(self.mu[second_label])
        self.w0 = (class_size[0] * projected_first + class_size[1] * projected_second) / n_samples

        return self.weight

    def predict(self, x_test):
        """
        Predict the class label of every test sample.

        :param x_test: 2-D array-like (n_samples, n_features); a single 1-D
            sample is treated as one row
        :return: 1-D array of predicted labels, taken from the labels seen in fit
        :raises ValueError: if called before ``fit``
        """
        if self.weight is None or self.class_values is None:
            raise ValueError("请先进行fit,计算投影方向和阈值,然后预测...")
        x_test = np.atleast_2d(np.asarray(x_test))
        scores = x_test.dot(self.weight) - self.w0
        # The direction points from the second class mean towards the first,
        # so a score below the threshold means the second (larger) label.
        return np.where(scores < 0, self.class_values[1], self.class_values[0])

test_lda2classify.py

python 复制代码
from sklearn.datasets import load_iris, load_breast_cancer
from lda2classify import LDABinaryClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Data: the binary breast-cancer benchmark.
# Alternative: the first two iris classes (the first 100 rows):
#   iris = load_iris()
#   X, y = iris.data[:100, :], iris.target[:100]
bc_data = load_breast_cancer()
X = bc_data.data
y = bc_data.target

# Hold out 30% of the samples, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=111,
    stratify=y,
)

# Fit on the training part, predict the held-out part, print the metrics.
lda = LDABinaryClassifier()
lda.fit(X_train, y_train)
y_test_pred = lda.predict(X_test)
report = classification_report(y_test, y_test_pred)
print(report)

鸢尾花取前两类:

breast_cancer 数据集:

三、线性判别分析——多分类降维算法

lda_multi_dim_reduction.py

python 复制代码
import numpy as np
import scipy as sp

class LDAMulti_DimReduction:
    """
    Multi-class linear discriminant analysis for supervised dimensionality reduction.

    The projection matrix is built from the leading generalized eigenvectors
    of the pair (Sb, Sw): between-class scatter against within-class scatter.
    """
    def __init__(self, n_components=2):
        self.n_components = n_components  # number of dimensions kept after projection
        self.Sw, self.Sb = None, None  # within-class / between-class scatter matrices
        self.eig_values = None  # generalized eigenvalues, sorted in descending order by fit
        self.W = None  # projection matrix, one column per kept direction

    def fit(self, x_samples, y_target):
        """
        Compute the scatter matrices and the projection matrix.

        :param x_samples: samples, 2-D array-like (n_samples, n_features)
        :param y_target: class label of every sample, 1-D array-like
        :return: the projection matrix (also stored in ``self.W``)
        """
        # Import the submodule explicitly: accessing ``scipy.linalg`` through a
        # bare ``import scipy`` is not guaranteed to work on every SciPy release.
        from scipy.linalg import eig

        x_samples, y_target = np.asarray(x_samples), np.asarray(y_target)
        _, n_features = x_samples.shape

        # Within-class scatter: sum of the scatter of each class around its own mean.
        self.Sw = np.zeros((n_features, n_features))
        for label in np.unique(y_target):
            class_x = x_samples[y_target == label]
            centered = class_x - np.mean(class_x, axis=0)
            self.Sw += centered.T.dot(centered)

        # Between-class scatter as total scatter minus within-class scatter.
        total_centered = x_samples - np.mean(x_samples, axis=0)
        self.Sb = total_centered.T.dot(total_centered) - self.Sw

        eig_values, eig_vec = eig(self.Sb, self.Sw)
        # Sb and Sw are symmetric, so with an invertible (positive definite) Sw
        # the eigenvalues are real in exact arithmetic. Any imaginary part is
        # round-off; drop it so that W and the variance ratios stay real-valued.
        eig_values, eig_vec = np.real(eig_values), np.real(eig_vec)

        order = np.argsort(eig_values)[::-1]  # largest eigenvalue first
        self.eig_values = eig_values[order]
        self.W = eig_vec[:, order][:, :self.n_components]
        return self.W

    def transform(self, x_samples):
        """
        Project samples with the fitted projection matrix.

        :param x_samples: samples, array-like (n_samples, n_features)
        :return: projected samples
        :raises ValueError: if called before ``fit``
        """
        if self.W is None:
            raise ValueError("请先进行fit,构造投影矩阵,然后降维...")
        # asarray so that plain lists are accepted here just as they are in fit.
        return np.asarray(x_samples).dot(self.W)

    def fit_transform(self, x_samples, y_target):
        """
        Fit the projection matrix, then project the same samples.

        :param x_samples: samples, 2-D array-like (n_samples, n_features)
        :param y_target: class label of every sample, 1-D array-like
        :return: projected samples
        """
        self.fit(x_samples, y_target)
        return self.transform(x_samples)

    def variance_explained(self):
        """
        Share of the eigenvalue sum carried by each kept direction.

        :return: 1-D real array with one ratio per kept component
        :raises ValueError: if called before ``fit``
        """
        if self.eig_values is None:
            raise ValueError("请先进行fit,计算广义特征值,然后求解释方差比...")
        # Work on a real copy instead of overwriting the stored eigenvalues.
        eig_values = np.real(self.eig_values)
        ratio = eig_values / np.sum(eig_values)
        return ratio[:self.n_components]

test_lda_dim_reduction.py

python 复制代码
from sklearn.datasets import load_iris, load_wine, make_classification
from lda_multi_dim_reduction import LDAMulti_DimReduction
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


iris = load_iris()
X, y = iris.data, iris.target

# Alternative datasets (swap in one of these instead of iris):
# X, y = make_classification(n_samples=2000, n_features=20, n_informative=3, n_classes=5,
#                            n_redundant=0, n_clusters_per_class=1, class_sep=2, random_state=42)
#
# wine = load_wine()
# X, y = wine.data, wine.target

X = StandardScaler().fit_transform(X)

# LDA has at most n_classes - 1 informative directions; iris has 3 classes,
# so keep 2 (the 5-class make_classification data above can use 3).
lda = LDAMulti_DimReduction(n_components=2)
lda.fit(X, y)
x_new = lda.transform(X)
print(lda.variance_explained())

plt.figure(figsize=(14, 5))

# Left panel: projection from the hand-written implementation.
plt.subplot(121)
plt.scatter(x_new[:, 0], x_new[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Myself)", fontdict={"fontsize": 14})
plt.grid(ls=":")

# Right panel: projection from scikit-learn, for comparison.
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
x_skl = lda.transform(X)
plt.subplot(122)
# Plot the scikit-learn result here; plotting x_new again would only
# duplicate the left panel.
plt.scatter(x_skl[:, 0], x_skl[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Sklearn)", fontdict={"fontsize": 14})
plt.grid(ls=":")

plt.tight_layout()
plt.show()

鸢尾花数据集:

降维后前两个主特征的解释方差比

9.91212605e-01 8.78739503e-03

红酒数据集:

降维后前两个主特征的解释方差比

6.87478888e-01 3.12521112e-01

使用make_classification创建数据集:

降维后前三个主特征的解释方差比

0.47101585 0.44946339 0.07876534

相关推荐
眼镜哥(with glasses)14 分钟前
蓝桥杯 国赛2024python(b组)题目(1-3)
数据结构·算法·蓝桥杯
老胖闲聊3 小时前
Python Copilot【代码辅助工具】 简介
开发语言·python·copilot
Blossom.1183 小时前
使用Python和Scikit-Learn实现机器学习模型调优
开发语言·人工智能·python·深度学习·目标检测·机器学习·scikit-learn
曹勖之4 小时前
基于ROS2,撰写python脚本,根据给定的舵-桨动力学模型实现动力学更新
开发语言·python·机器人·ros2
scdifsn4 小时前
动手学深度学习12.7. 参数服务器-笔记&练习(PyTorch)
pytorch·笔记·深度学习·分布式计算·数据并行·参数服务器
lyaihao5 小时前
使用python实现奔跑的线条效果
python·绘图
郄堃Deep Traffic5 小时前
机器学习+城市规划第十四期:利用半参数地理加权回归来实现区域带宽不同的规划任务
人工智能·机器学习·回归·城市规划
int型码农5 小时前
数据结构第八章(一) 插入排序
c语言·数据结构·算法·排序算法·希尔排序
UFIT5 小时前
NoSQL之redis哨兵
java·前端·算法
喜欢吃燃面5 小时前
C++刷题:日期模拟(1)
c++·学习·算法