Chapter 4: Decision Trees

Example 3.5

Code implementation 1: ID3 (information gain)

import pandas as pd
from math import log2

class ID3DecisionTree:
    """ID3 decision tree (text output only)."""
    def __init__(self):
        self.tree = None            # the learned tree (nested dict)
        self.feature_names = None   # list of feature names
        self.y_train = None         # training labels (fallback for prediction)

    def calc_entropy(self, y):
        """Compute the information entropy of a label series."""
        label_counts = y.value_counts()  # count of each class
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total
            if prob > 0:  # guard against log2(0)
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value  # select matching samples
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)  # drop the used feature
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the largest information gain."""
        num_features = X.shape[1]
        base_entropy = self.calc_entropy(y)  # entropy of the parent node
        best_info_gain = 0.0
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()  # all values the feature takes
            new_entropy = 0.0  # conditional entropy after the split

            # Entropy of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                new_entropy += prob * self.calc_entropy(sub_y)

            # information gain = parent entropy - conditional entropy
            info_gain = base_entropy - new_entropy

            # keep the best feature so far
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature
        best_feature_idx = self.choose_best_feature(X, y)
        # Stop 3: no feature has positive information gain -> majority label
        # (without this guard, index -1 would silently pick the last column)
        if best_feature_idx == -1:
            return self.majority_vote(y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        # Remaining feature names (the chosen feature is consumed)
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model (build the tree)."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label directly
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the branch matching the sample's value
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Recursively print the tree in text form."""
        if tree is None:
            tree = self.tree

        # Leaf node: print the label
        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        # Decision node: print the feature and its subtrees
        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        # Walk the subtree of every feature value
        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            # The last child uses a different connector for nicer output
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: seed-germination data
if __name__ == "__main__":
    # 1. Build the data set (5 "是", 4 "否")
    data = {
        "形状": ["圆形", "圆形", "皱形", "皱形", "圆形", "皱形", "圆形", "皱形", "圆形"],
        "颜色": ["灰色", "白色", "灰色", "白色", "白色", "白色", "白色", "灰色", "灰色"],
        "大小": ["皱缩", "饱满", "皱缩", "饱满", "饱满", "饱满", "饱满", "皱缩", "皱缩"],
        "土壤": ["酸性", "酸性", "碱性", "酸性", "酸性", "碱性", "酸性", "碱性", "碱性"],
        "水分": ["多", "少", "多", "多", "少", "少", "少", "多", "多"],
        "日照": ["12h以上", "12h以上", "12h以上", "12h以下", "12h以下", "12h以上", "12h以下", "12h以下", "12h以上"],
        "发芽": ["否", "是", "否", "是", "是", "是", "是", "否", "否"]
    }
    df = pd.DataFrame(data)
    X = df.drop("发芽", axis=1)  # features
    y = df["发芽"]               # labels

    # 2. Train the ID3 tree
    print("=== Training the ID3 decision tree ===")
    id3 = ID3DecisionTree()
    id3.fit(X, y)

    # 3. Print the tree structure (text form)
    print("\n=== Decision tree structure (text form) ===")
    id3.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "形状": ["圆形"], "颜色": ["白色"], "大小": ["饱满"],
        "土壤": ["碱性"], "水分": ["多"], "日照": ["12h以下"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)  # transposed for readability
    print(f"Prediction: {id3.predict(test_data).iloc[0]}")

Output:

=== Training the ID3 decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 颜色
│   ├── Value: 灰色
│      └── Result: 否
│   └── Value: 白色
       └── Result: 是

=== Prediction on test data ===
Test sample features:
        0
形状     圆形
颜色     白色
大小     饱满
土壤     碱性
水分      多
日照  12h以下
Prediction: 是
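
The printed tree is easy to verify by hand. The snippet below (a standalone sanity check, separate from the listing above) recomputes the data set entropy and the information gain of 颜色; splitting on 颜色 leaves two pure subsets, so its gain attains the theoretical maximum H(D):

from math import log2

# Label distribution of the seed data set: 5 "是", 4 "否"
p_yes, p_no = 5 / 9, 4 / 9
h_d = -(p_yes * log2(p_yes) + p_no * log2(p_no))
print(f"H(D) = {h_d:.4f}")  # ≈ 0.9911

# 颜色 = 灰色 -> 4 samples, all "否"; 颜色 = 白色 -> 5 samples, all "是".
# Both subsets are pure, so the conditional entropy is 0 and the gain is H(D).
gain_color = h_d - ((4 / 9) * 0.0 + (5 / 9) * 0.0)
print(f"Gain(D, 颜色) = {gain_color:.4f}")  # ≈ 0.9911

Note that 大小 separates the labels equally well (皱缩 -> all "否", 饱满 -> all "是"). Because choose_best_feature only replaces the incumbent on a strictly larger gain, the tie is resolved in favour of 颜色, which comes first in the column order.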

Code implementation 2: based on the Gini index

import pandas as pd

class GiniDecisionTree:
    """Decision tree based on the Gini index (CART-style)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # feature names
        self.y_train = None         # training labels

    def calc_gini(self, y):
        """Compute the Gini impurity of a label series."""
        # Gini = 1 - Σ p_i², where p_i is the proportion of class i
        label_counts = y.value_counts()
        total = len(y)
        gini = 1.0
        for count in label_counts:
            prob = count / total
            gini -= prob ** 2
        return gini

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the smallest (best) Gini index."""
        num_features = X.shape[1]
        best_gini_index = float('inf')  # smallest Gini index so far
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()
            gini_index = 0.0  # Gini index = weighted average Gini impurity

            # Gini impurity of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                gini_index += prob * self.calc_gini(sub_y)

            # the smaller the Gini index, the better the split
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature (smallest Gini index)
        best_feature_idx = self.choose_best_feature(X, y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the matching branch
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: seed-germination data
if __name__ == "__main__":
    # 1. Data set (5 "是", 4 "否")
    data = {
        "形状": ["圆形", "圆形", "皱形", "皱形", "圆形", "皱形", "圆形", "皱形", "圆形"],
        "颜色": ["灰色", "白色", "灰色", "白色", "白色", "白色", "白色", "灰色", "灰色"],
        "大小": ["皱缩", "饱满", "皱缩", "饱满", "饱满", "饱满", "饱满", "皱缩", "皱缩"],
        "土壤": ["酸性", "酸性", "碱性", "酸性", "酸性", "碱性", "酸性", "碱性", "碱性"],
        "水分": ["多", "少", "多", "多", "少", "少", "少", "多", "多"],
        "日照": ["12h以上", "12h以上", "12h以上", "12h以下", "12h以下", "12h以上", "12h以下", "12h以下", "12h以上"],
        "发芽": ["否", "是", "否", "是", "是", "是", "是", "否", "否"]
    }
    df = pd.DataFrame(data)
    X = df.drop("发芽", axis=1)  # features
    y = df["发芽"]               # labels

    # 2. Train the Gini-based tree
    print("=== Training the Gini-based decision tree ===")
    gini_tree = GiniDecisionTree()
    gini_tree.fit(X, y)

    # 3. Print the tree structure
    print("\n=== Decision tree structure (text form) ===")
    gini_tree.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "形状": ["圆形"], "颜色": ["白色"], "大小": ["饱满"],
        "土壤": ["碱性"], "水分": ["多"], "日照": ["12h以下"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)
    print(f"Prediction: {gini_tree.predict(test_data).iloc[0]}")

Output:

=== Training the Gini-based decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 颜色
│   ├── Value: 灰色
│      └── Result: 否
│   └── Value: 白色
       └── Result: 是

=== Prediction on test data ===
Test sample features:
        0
形状     圆形
颜色     白色
大小     饱满
土壤     碱性
水分      多
日照  12h以下
Prediction: 是
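
The same hand check works for the Gini criterion (again a standalone snippet, not part of the listing): the Gini impurity of the full data set is 40/81, and the 颜色 split drives the weighted impurity down to zero, the smallest possible value:

# Gini impurity of the seed data set (5 "是", 4 "否")
gini_d = 1 - (5 / 9) ** 2 - (4 / 9) ** 2
print(f"Gini(D) = {gini_d:.4f}")  # = 40/81 ≈ 0.4938

# 颜色 splits the data into two pure subsets, so its weighted Gini index is
# (4/9) * 0 + (5/9) * 0 = 0 -- no other feature can do better.
gini_index_color = (4 / 9) * 0.0 + (5 / 9) * 0.0
print(f"Gini_index(D, 颜色) = {gini_index_color:.4f}")  # = 0.0

As with ID3, 大小 ties at 0; the strict < comparison keeps the earlier feature, which is why both criteria produce the identical tree on this data.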

Example 3.6

Code implementation: ID3

import pandas as pd
from math import log2

class ID3DecisionTree:
    """ID3 decision tree (cold-diagnosis example)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # list of feature names
        self.y_train = None         # training labels

    def calc_entropy(self, y):
        """Compute the information entropy (uncertainty) of a label series."""
        label_counts = y.value_counts()  # count of each class
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total  # class probability
            if prob > 0:  # guard against log2(0)
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value  # select matching samples
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)  # drop the used feature
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the largest information gain."""
        num_features = X.shape[1]
        base_entropy = self.calc_entropy(y)  # entropy of the parent node
        best_info_gain = 0.0
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()  # all values the feature takes
            new_entropy = 0.0  # conditional entropy after the split

            # Entropy of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                new_entropy += prob * self.calc_entropy(sub_y)

            # information gain = parent entropy - conditional entropy
            info_gain = base_entropy - new_entropy

            # keep the best feature so far
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature
        best_feature_idx = self.choose_best_feature(X, y)
        # Stop 3: no feature has positive information gain -> majority label
        # (without this guard, index -1 would silently pick the last column)
        if best_feature_idx == -1:
            return self.majority_vote(y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        # Remaining feature names (the chosen feature is consumed)
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model (build the tree)."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label directly
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the branch matching the sample's value
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        # Leaf node: print the label
        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        # Decision node: print the feature and its subtrees
        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        # Walk the subtree of every feature value
        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            # The last child uses a different connector for nicer output
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: cold-diagnosis data
if __name__ == "__main__":
    # 1. Build the data set (16 samples: 12 "是", 4 "否")
    data = {
        "流鼻涕": ["是", "否", "是", "是", "否", "是", "是", "是", "否", "是", "是", "否", "否", "否", "否", "否"],
        "体温": ["较高", "非常高", "非常高", "正常", "正常", "较高", "较高", "非常高", "较高", "正常", "正常", "正常", "较高", "非常高", "非常高", "较高"],
        "肌肉疼": ["否", "否", "是", "是", "否", "是", "否", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "头疼": ["是", "否", "是", "否", "否", "是", "是", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "感冒": ["是", "否", "是", "是", "否", "是", "是", "是", "是", "否", "是", "是", "否", "是", "是", "是"]
    }
    df = pd.DataFrame(data)
    X = df.drop("感冒", axis=1)  # features: 流鼻涕, 体温, 肌肉疼, 头疼
    y = df["感冒"]               # label: cold or not

    # 2. Train the ID3 tree
    print("=== Training the cold-diagnosis decision tree ===")
    id3 = ID3DecisionTree()
    id3.fit(X, y)

    # 3. Print the tree structure (text form)
    print("\n=== Decision tree structure (text form) ===")
    id3.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "流鼻涕": ["否"],
        "体温": ["正常"],
        "肌肉疼": ["否"],
        "头疼": ["否"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)  # transposed for readability
    print(f"Prediction (cold or not): {id3.predict(test_data).iloc[0]}")

Output:

=== Training the cold-diagnosis decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 头疼
│   ├── Value: 是
│      └── Result: 是
│   └── Value: 否
       ┌── Feature: 肌肉疼
       │   ├── Value: 否
       │      ┌── Feature: 体温
       │      │   ├── Value: 非常高
       │      │      └── Result: 否
       │      │   ├── Value: 正常
       │      │      ┌── Feature: 流鼻涕
       │      │      │   ├── Value: 否
       │      │      │      └── Result: 否
       │      │      │   └── Value: 是
       │      │             └── Result: 否
       │      │   └── Value: 较高
       │             └── Result: 否
       │   └── Value: 是
              └── Result: 是

=== Prediction on test data ===
Test sample features:
         0
流鼻涕    否
体温     正常
肌肉疼    否
头疼      否
Prediction (cold or not): 否
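
The root choice can again be confirmed by hand (a standalone check; the class counts below are read off the 16-sample table). 头疼 has the largest information gain of the four features, with 肌肉疼 a distant second:

from math import log2

def entropy(counts):
    """Entropy of a label distribution given as a list of class counts."""
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c > 0)

h_d = entropy([12, 4])  # full data set: 12 "是", 4 "否" -> ≈ 0.8113

# 头疼 = 是: 10 samples, all "是"; 头疼 = 否: 6 samples, 2 "是" / 4 "否"
gain_headache = h_d - ((10 / 16) * entropy([10]) + (6 / 16) * entropy([2, 4]))
# 肌肉疼 = 是: 9 samples, all "是"; 肌肉疼 = 否: 7 samples, 3 "是" / 4 "否"
gain_muscle = h_d - ((9 / 16) * entropy([9]) + (7 / 16) * entropy([3, 4]))

print(f"Gain(D, 头疼)   ≈ {gain_headache:.4f}")  # ≈ 0.4669
print(f"Gain(D, 肌肉疼) ≈ {gain_muscle:.4f}")    # ≈ 0.3803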

Code implementation: C4.5

import pandas as pd
from math import log2

class C45DecisionTree:
    """C4.5 decision tree (split criterion: gain ratio)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # feature names
        self.y_train = None         # training labels

    def calc_entropy(self, y):
        """Compute the information entropy of a label series."""
        label_counts = y.value_counts()
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total
            if prob > 0:
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value, is_continuous=False):
        """Split the data set (supports continuous features)."""
        if is_continuous:
            # Continuous feature: binary split into "<= value" and "> value"
            mask = X.iloc[:, feature_idx] <= value
            sub_X1 = X.loc[mask].drop(X.columns[feature_idx], axis=1)
            sub_y1 = y.loc[mask]
            sub_X2 = X.loc[~mask].drop(X.columns[feature_idx], axis=1)
            sub_y2 = y.loc[~mask]
            return (sub_X1, sub_y1), (sub_X2, sub_y2)
        else:
            # Discrete feature: split on equality with value
            mask = X.iloc[:, feature_idx] == value
            sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)
            sub_y = y.loc[mask]
            return sub_X, sub_y

    def calc_info_gain(self, X, y, feature_idx, is_continuous=False):
        """Compute the information gain (supports continuous features)."""
        base_entropy = self.calc_entropy(y)
        total = len(y)
        info_gain = 0.0
        best_split_val = None  # best threshold for a continuous feature

        if is_continuous:
            # Continuous feature: sort the values and search for the best threshold
            feature_values = sorted(X.iloc[:, feature_idx].unique())
            max_info_gain = -float('inf')
            # Candidate thresholds are the midpoints of adjacent values
            for i in range(len(feature_values) - 1):
                split_val = (feature_values[i] + feature_values[i+1]) / 2
                (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                    X, y, feature_idx, split_val, is_continuous=True
                )
                prob1 = len(sub_y1) / total
                prob2 = len(sub_y2) / total
                current_entropy = prob1 * self.calc_entropy(sub_y1) + prob2 * self.calc_entropy(sub_y2)
                current_info_gain = base_entropy - current_entropy
                if current_info_gain > max_info_gain:
                    max_info_gain = current_info_gain
                    best_split_val = split_val
            info_gain = max_info_gain
        else:
            # Discrete feature: weighted entropy over all values
            feature_values = X.iloc[:, feature_idx].unique()
            new_entropy = 0.0
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, feature_idx, value)
                prob = len(sub_y) / total
                new_entropy += prob * self.calc_entropy(sub_y)
            info_gain = base_entropy - new_entropy

        return info_gain, best_split_val

    def calc_split_info(self, X, y, feature_idx, is_continuous=False, split_val=None):
        """Compute the split information (denominator of the gain ratio)."""
        total = len(y)
        split_info = 0.0

        if is_continuous and split_val is not None:
            # Continuous feature: two branches defined by the threshold
            (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                X, y, feature_idx, split_val, is_continuous=True
            )
            prob1 = len(sub_y1) / total
            prob2 = len(sub_y2) / total
            if prob1 > 0:
                split_info -= prob1 * log2(prob1)
            if prob2 > 0:
                split_info -= prob2 * log2(prob2)
        else:
            # Discrete feature: one branch per value
            feature_values = X.iloc[:, feature_idx].unique()
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, feature_idx, value)
                prob = len(sub_y) / total
                if prob > 0:
                    split_info -= prob * log2(prob)

        return split_info

    def choose_best_feature(self, X, y, continuous_features=None):
        """Pick the feature with the largest gain ratio (the core of C4.5)."""
        if continuous_features is None:
            continuous_features = set()  # indices of continuous features
        num_features = X.shape[1]
        best_gain_ratio = -float('inf')
        best_feature_idx = -1
        best_split_val = None  # best threshold for a continuous feature

        for i in range(num_features):
            is_continuous = i in continuous_features
            # Information gain
            info_gain, split_val = self.calc_info_gain(X, y, i, is_continuous)
            # Split information; skip near-zero values to avoid division by zero
            split_info = self.calc_split_info(X, y, i, is_continuous, split_val)
            if split_info < 1e-10:
                continue
            # gain ratio = information gain / split information
            gain_ratio = info_gain / split_info

            # keep the best feature so far
            if gain_ratio > best_gain_ratio:
                best_gain_ratio = gain_ratio
                best_feature_idx = i
                best_split_val = split_val

        return best_feature_idx, best_split_val, best_feature_idx in continuous_features

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names, continuous_features=None):
        """Recursively build the decision tree."""
        if continuous_features is None:
            continuous_features = set()

        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature (largest gain ratio)
        best_idx, best_split, is_continuous = self.choose_best_feature(
            X, y, continuous_features
        )
        # Stop 3: every feature was skipped (split information ~ 0) -> majority label
        if best_idx == -1:
            return self.majority_vote(y)
        best_name = feature_names[best_idx]

        # Continuous feature: encode the threshold in the node name
        # (rounded to 2 decimals; predict_single parses it back out)
        if is_continuous:
            best_name = f"{best_name}<= {best_split:.2f}"

        tree = {best_name: {}}
        remaining_names = [f for i, f in enumerate(feature_names) if i != best_idx]
        remaining_continuous = {i for i in continuous_features if i != best_idx}
        # Re-index the continuous features (indices above best_idx shift down by 1)
        remaining_continuous = {i-1 if i > best_idx else i for i in remaining_continuous}

        if is_continuous:
            # Continuous feature: two branches, <= threshold and > threshold
            (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                X, y, best_idx, best_split, is_continuous=True
            )
            # Recursively build the subtrees
            tree[best_name]["是"] = self.build_tree(
                sub_X1, sub_y1, remaining_names, remaining_continuous
            )
            tree[best_name]["否"] = self.build_tree(
                sub_X2, sub_y2, remaining_names, remaining_continuous
            )
        else:
            # Discrete feature: one subtree per value
            feature_values = X.iloc[:, best_idx].unique()
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, best_idx, value)
                tree[best_name][value] = self.build_tree(
                    sub_X, sub_y, remaining_names, remaining_continuous
                )

        return tree

    def fit(self, X, y, feature_names=None, continuous_features=None):
        """Train the model."""
        if feature_names is None:
            feature_names = list(X.columns)
        if continuous_features is None:
            continuous_features = set()  # column names of continuous features
        # Map continuous column names to column indices
        self.continuous_idxs = {feature_names.index(f) for f in continuous_features}
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names, self.continuous_idxs)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            return tree

        # Parse the current node (continuous nodes carry a threshold)
        current_node = next(iter(tree.keys()))
        if "<= " in current_node:
            # Continuous feature: recover the feature name and threshold
            feature_name, split_val = current_node.split("<= ")
            split_val = float(split_val)
            # Compare the sample's value against the threshold
            sample_val = sample[feature_name]
            if sample_val <= split_val:
                return self.predict_single(sample, tree[current_node]["是"])
            else:
                return self.predict_single(sample, tree[current_node]["否"])
        else:
            # Discrete feature: match the value directly
            feature_name = current_node
            sample_val = sample[feature_name]
            if sample_val in tree[current_node]:
                return self.predict_single(sample, tree[current_node][sample_val])
            else:
                # Unseen feature value: fall back to the training majority label
                return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        node_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {node_name}")

        values = list(tree[node_name].keys())
        for i, value in enumerate(values):
            if i == len(values) - 1:
                print(f"{indent}│   └── Branch: {value}")
                self.print_tree(tree[node_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Branch: {value}")
                self.print_tree(tree[node_name][value], indent + "│      ")


# Demo: cold-diagnosis data with a continuous feature
if __name__ == "__main__":
    # 1. Build the data set (体温值 is a continuous temperature reading)
    data = {
        "流鼻涕": ["是", "否", "是", "是", "否", "是", "是", "是", "否", "是", "是", "否", "否", "否", "否", "否"],
        "体温值": [37.5, 39.2, 39.5, 36.5, 36.3, 37.8, 37.6, 39.1, 37.7, 36.4, 36.6, 36.2, 37.9, 39.3, 39.4, 37.4],
        "肌肉疼": ["否", "否", "是", "是", "否", "是", "否", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "头疼": ["是", "否", "是", "否", "否", "是", "是", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "感冒": ["是", "否", "是", "是", "否", "是", "是", "是", "是", "否", "是", "是", "否", "是", "是", "是"]
    }
    df = pd.DataFrame(data)
    X = df.drop("感冒", axis=1)
    y = df["感冒"]

    # 2. Declare 体温值 as a continuous feature
    continuous_features = {"体温值"}

    # 3. Train the C4.5 tree
    print("=== Training the C4.5 decision tree ===")
    c45 = C45DecisionTree()
    c45.fit(X, y, continuous_features=continuous_features)

    # 4. Print the tree structure
    print("\n=== Decision tree structure (text form) ===")
    c45.print_tree()

    # 5. Predict a test sample
    test_data = pd.DataFrame({
        "流鼻涕": ["否"],
        "体温值": [36.2],
        "肌肉疼": ["否"],
        "头疼": ["否"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)
    print(f"Prediction (cold or not): {c45.predict(test_data).iloc[0]}")

Output:

=== Training the C4.5 decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 头疼
│   ├── Branch: 是
│      └── Result: 是
│   └── Branch: 否
       ┌── Feature: 体温值<= 36.25
       │   ├── Branch: 是
       │      └── Result: 是
       │   └── Branch: 否
              ┌── Feature: 肌肉疼
              │   ├── Branch: 否
              │      └── Result: 否
              │   └── Branch: 是
                     └── Result: 是

=== Prediction on test data ===
Test sample features:
          0
流鼻涕     否
体温值   36.2
肌肉疼     否
头疼       否
Prediction (cold or not): 是

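To see why C4.5 retains the 体温值<= 36.25 split, its gain ratio at the 头疼 = 否 node can be recomputed by hand (a standalone sketch; the counts are read off the data set above). A convenient identity used here: the split information is simply the entropy of the branch sizes.

from math import log2

def entropy(counts):
    """Entropy of a distribution given as a list of counts."""
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c > 0)

# At the 头疼 = 否 node there are 6 samples (2 "是" / 4 "否") with
# 体温值 = 36.2, 36.3, 36.4, 36.5, 37.9, 39.2.
# Threshold 36.25 puts 1 sample ("是") on the <= side and
# 5 samples (1 "是" / 4 "否") on the > side.
h_node = entropy([2, 4])
h_cond = (1 / 6) * entropy([1]) + (5 / 6) * entropy([1, 4])
info_gain = h_node - h_cond                  # ≈ 0.3167

split_info = entropy([1, 5])                 # branch sizes 1 and 5 -> ≈ 0.6500
print(f"gain ratio ≈ {info_gain / split_info:.4f}")  # ≈ 0.4872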

前端·后端·python