Chapter 4: Decision Trees

Example 3.5

Code implementation 1: ID3 (information gain)

import pandas as pd
from math import log2

class ID3DecisionTree:
    """ID3 decision tree (text output only)."""
    def __init__(self):
        self.tree = None            # the learned tree (nested dict)
        self.feature_names = None   # list of feature names
        self.y_train = None         # training labels (fallback for prediction)

    def calc_entropy(self, y):
        """Compute the information entropy of a label series."""
        label_counts = y.value_counts()  # count of each class
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total
            if prob > 0:  # guard against log2(0)
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value  # select matching samples
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)  # drop the used feature
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the largest information gain."""
        num_features = X.shape[1]
        base_entropy = self.calc_entropy(y)  # entropy of the parent node
        best_info_gain = 0.0
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()  # all values the feature takes
            new_entropy = 0.0  # conditional entropy after the split

            # Entropy of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                new_entropy += prob * self.calc_entropy(sub_y)

            # information gain = parent entropy - conditional entropy
            info_gain = base_entropy - new_entropy

            # keep the best feature so far
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature
        best_feature_idx = self.choose_best_feature(X, y)
        # Stop 3: no feature has positive information gain -> majority label
        # (without this guard, index -1 would silently pick the last column)
        if best_feature_idx == -1:
            return self.majority_vote(y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        # Remaining feature names (the chosen feature is consumed)
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model (build the tree)."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label directly
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the branch matching the sample's value
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Recursively print the tree in text form."""
        if tree is None:
            tree = self.tree

        # Leaf node: print the label
        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        # Decision node: print the feature and its subtrees
        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        # Walk the subtree of every feature value
        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            # The last child uses a different connector for nicer output
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: seed-germination data
if __name__ == "__main__":
    # 1. Build the data set (5 "是", 4 "否")
    data = {
        "形状": ["圆形", "圆形", "皱形", "皱形", "圆形", "皱形", "圆形", "皱形", "圆形"],
        "颜色": ["灰色", "白色", "灰色", "白色", "白色", "白色", "白色", "灰色", "灰色"],
        "大小": ["皱缩", "饱满", "皱缩", "饱满", "饱满", "饱满", "饱满", "皱缩", "皱缩"],
        "土壤": ["酸性", "酸性", "碱性", "酸性", "酸性", "碱性", "酸性", "碱性", "碱性"],
        "水分": ["多", "少", "多", "多", "少", "少", "少", "多", "多"],
        "日照": ["12h以上", "12h以上", "12h以上", "12h以下", "12h以下", "12h以上", "12h以下", "12h以下", "12h以上"],
        "发芽": ["否", "是", "否", "是", "是", "是", "是", "否", "否"]
    }
    df = pd.DataFrame(data)
    X = df.drop("发芽", axis=1)  # features
    y = df["发芽"]               # labels

    # 2. Train the ID3 tree
    print("=== Training the ID3 decision tree ===")
    id3 = ID3DecisionTree()
    id3.fit(X, y)

    # 3. Print the tree structure (text form)
    print("\n=== Decision tree structure (text form) ===")
    id3.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "形状": ["圆形"], "颜色": ["白色"], "大小": ["饱满"],
        "土壤": ["碱性"], "水分": ["多"], "日照": ["12h以下"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)  # transposed for readability
    print(f"Prediction: {id3.predict(test_data).iloc[0]}")

Output:

=== Training the ID3 decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 颜色
│   ├── Value: 灰色
│      └── Result: 否
│   └── Value: 白色
       └── Result: 是

=== Prediction on test data ===
Test sample features:
        0
形状     圆形
颜色     白色
大小     饱满
土壤     碱性
水分      多
日照  12h以下
Prediction: 是
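
The printed tree is easy to verify by hand. The snippet below (a standalone sanity check, separate from the listing above) recomputes the data set entropy and the information gain of 颜色; splitting on 颜色 leaves two pure subsets, so its gain attains the theoretical maximum H(D):

from math import log2

# Label distribution of the seed data set: 5 "是", 4 "否"
p_yes, p_no = 5 / 9, 4 / 9
h_d = -(p_yes * log2(p_yes) + p_no * log2(p_no))
print(f"H(D) = {h_d:.4f}")  # ≈ 0.9911

# 颜色 = 灰色 -> 4 samples, all "否"; 颜色 = 白色 -> 5 samples, all "是".
# Both subsets are pure, so the conditional entropy is 0 and the gain is H(D).
gain_color = h_d - ((4 / 9) * 0.0 + (5 / 9) * 0.0)
print(f"Gain(D, 颜色) = {gain_color:.4f}")  # ≈ 0.9911

Note that 大小 separates the labels equally well (皱缩 -> all "否", 饱满 -> all "是"). Because choose_best_feature only replaces the incumbent on a strictly larger gain, the tie is resolved in favour of 颜色, which comes first in the column order.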

Code implementation 2: based on the Gini index

import pandas as pd

class GiniDecisionTree:
    """Decision tree based on the Gini index (CART-style)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # feature names
        self.y_train = None         # training labels

    def calc_gini(self, y):
        """Compute the Gini impurity of a label series."""
        # Gini = 1 - Σ p_i², where p_i is the proportion of class i
        label_counts = y.value_counts()
        total = len(y)
        gini = 1.0
        for count in label_counts:
            prob = count / total
            gini -= prob ** 2
        return gini

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the smallest (best) Gini index."""
        num_features = X.shape[1]
        best_gini_index = float('inf')  # smallest Gini index so far
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()
            gini_index = 0.0  # Gini index = weighted average Gini impurity

            # Gini impurity of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                gini_index += prob * self.calc_gini(sub_y)

            # the smaller the Gini index, the better the split
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature (smallest Gini index)
        best_feature_idx = self.choose_best_feature(X, y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the matching branch
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: seed-germination data
if __name__ == "__main__":
    # 1. Data set (5 "是", 4 "否")
    data = {
        "形状": ["圆形", "圆形", "皱形", "皱形", "圆形", "皱形", "圆形", "皱形", "圆形"],
        "颜色": ["灰色", "白色", "灰色", "白色", "白色", "白色", "白色", "灰色", "灰色"],
        "大小": ["皱缩", "饱满", "皱缩", "饱满", "饱满", "饱满", "饱满", "皱缩", "皱缩"],
        "土壤": ["酸性", "酸性", "碱性", "酸性", "酸性", "碱性", "酸性", "碱性", "碱性"],
        "水分": ["多", "少", "多", "多", "少", "少", "少", "多", "多"],
        "日照": ["12h以上", "12h以上", "12h以上", "12h以下", "12h以下", "12h以上", "12h以下", "12h以下", "12h以上"],
        "发芽": ["否", "是", "否", "是", "是", "是", "是", "否", "否"]
    }
    df = pd.DataFrame(data)
    X = df.drop("发芽", axis=1)  # features
    y = df["发芽"]               # labels

    # 2. Train the Gini-based tree
    print("=== Training the Gini-based decision tree ===")
    gini_tree = GiniDecisionTree()
    gini_tree.fit(X, y)

    # 3. Print the tree structure
    print("\n=== Decision tree structure (text form) ===")
    gini_tree.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "形状": ["圆形"], "颜色": ["白色"], "大小": ["饱满"],
        "土壤": ["碱性"], "水分": ["多"], "日照": ["12h以下"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)
    print(f"Prediction: {gini_tree.predict(test_data).iloc[0]}")

Output:

=== Training the Gini-based decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 颜色
│   ├── Value: 灰色
│      └── Result: 否
│   └── Value: 白色
       └── Result: 是

=== Prediction on test data ===
Test sample features:
        0
形状     圆形
颜色     白色
大小     饱满
土壤     碱性
水分      多
日照  12h以下
Prediction: 是
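
The same hand check works for the Gini criterion (again a standalone snippet, not part of the listing): the Gini impurity of the full data set is 40/81, and the 颜色 split drives the weighted impurity down to zero, the smallest possible value:

# Gini impurity of the seed data set (5 "是", 4 "否")
gini_d = 1 - (5 / 9) ** 2 - (4 / 9) ** 2
print(f"Gini(D) = {gini_d:.4f}")  # = 40/81 ≈ 0.4938

# 颜色 splits the data into two pure subsets, so its weighted Gini index is
# (4/9) * 0 + (5/9) * 0 = 0 -- no other feature can do better.
gini_index_color = (4 / 9) * 0.0 + (5 / 9) * 0.0
print(f"Gini_index(D, 颜色) = {gini_index_color:.4f}")  # = 0.0

As with ID3, 大小 ties at 0; the strict < comparison keeps the earlier feature, which is why both criteria produce the identical tree on this data.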

Example 3.6

Code implementation: ID3

import pandas as pd
from math import log2

class ID3DecisionTree:
    """ID3 decision tree (cold-diagnosis example)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # list of feature names
        self.y_train = None         # training labels

    def calc_entropy(self, y):
        """Compute the information entropy (uncertainty) of a label series."""
        label_counts = y.value_counts()  # count of each class
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total  # class probability
            if prob > 0:  # guard against log2(0)
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value):
        """Split the data set on feature == value."""
        mask = X.iloc[:, feature_idx] == value  # select matching samples
        sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)  # drop the used feature
        sub_y = y.loc[mask]
        return sub_X, sub_y

    def choose_best_feature(self, X, y):
        """Pick the feature with the largest information gain."""
        num_features = X.shape[1]
        base_entropy = self.calc_entropy(y)  # entropy of the parent node
        best_info_gain = 0.0
        best_feature_idx = -1

        for i in range(num_features):
            feature_values = X.iloc[:, i].unique()  # all values the feature takes
            new_entropy = 0.0  # conditional entropy after the split

            # Entropy of each subset, weighted by its proportion
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, i, value)
                prob = len(sub_y) / len(y)  # subset proportion
                new_entropy += prob * self.calc_entropy(sub_y)

            # information gain = parent entropy - conditional entropy
            info_gain = base_entropy - new_entropy

            # keep the best feature so far
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature_idx = i

        return best_feature_idx

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names):
        """Recursively build the decision tree."""
        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature
        best_feature_idx = self.choose_best_feature(X, y)
        # Stop 3: no feature has positive information gain -> majority label
        # (without this guard, index -1 would silently pick the last column)
        if best_feature_idx == -1:
            return self.majority_vote(y)
        best_feature_name = feature_names[best_feature_idx]

        # Tree node as a dict: {feature: {value: subtree}}
        tree = {best_feature_name: {}}
        # Remaining feature names (the chosen feature is consumed)
        remaining_feature_names = [f for i, f in enumerate(feature_names) if i != best_feature_idx]

        # Recursively build the subtrees
        feature_values = X.iloc[:, best_feature_idx].unique()
        for value in feature_values:
            sub_X, sub_y = self.split_dataset(X, y, best_feature_idx, value)
            tree[best_feature_name][value] = self.build_tree(sub_X, sub_y, remaining_feature_names)

        return tree

    def fit(self, X, y, feature_names=None):
        """Train the model (build the tree)."""
        if feature_names is None:
            feature_names = list(X.columns)
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        # Leaf node: return the label directly
        if not isinstance(tree, dict):
            return tree

        # Decision node: follow the branch matching the sample's value
        feature_name = next(iter(tree.keys()))
        feature_value = sample[feature_name]

        if feature_value in tree[feature_name]:
            return self.predict_single(sample, tree[feature_name][feature_value])
        else:
            # Unseen feature value: fall back to the training majority label
            return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        # Leaf node: print the label
        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        # Decision node: print the feature and its subtrees
        feature_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {feature_name}")

        # Walk the subtree of every feature value
        values = list(tree[feature_name].keys())
        for i, value in enumerate(values):
            # The last child uses a different connector for nicer output
            if i == len(values) - 1:
                print(f"{indent}│   └── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Value: {value}")
                self.print_tree(tree[feature_name][value], indent + "│      ")


# Demo: cold-diagnosis data
if __name__ == "__main__":
    # 1. Build the data set (16 samples: 12 "是", 4 "否")
    data = {
        "流鼻涕": ["是", "否", "是", "是", "否", "是", "是", "是", "否", "是", "是", "否", "否", "否", "否", "否"],
        "体温": ["较高", "非常高", "非常高", "正常", "正常", "较高", "较高", "非常高", "较高", "正常", "正常", "正常", "较高", "非常高", "非常高", "较高"],
        "肌肉疼": ["否", "否", "是", "是", "否", "是", "否", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "头疼": ["是", "否", "是", "否", "否", "是", "是", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "感冒": ["是", "否", "是", "是", "否", "是", "是", "是", "是", "否", "是", "是", "否", "是", "是", "是"]
    }
    df = pd.DataFrame(data)
    X = df.drop("感冒", axis=1)  # features: 流鼻涕, 体温, 肌肉疼, 头疼
    y = df["感冒"]               # label: cold or not

    # 2. Train the ID3 tree
    print("=== Training the cold-diagnosis decision tree ===")
    id3 = ID3DecisionTree()
    id3.fit(X, y)

    # 3. Print the tree structure (text form)
    print("\n=== Decision tree structure (text form) ===")
    id3.print_tree()

    # 4. Predict a test sample
    test_data = pd.DataFrame({
        "流鼻涕": ["否"],
        "体温": ["正常"],
        "肌肉疼": ["否"],
        "头疼": ["否"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)  # transposed for readability
    print(f"Prediction (cold or not): {id3.predict(test_data).iloc[0]}")

Output:

=== Training the cold-diagnosis decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 头疼
│   ├── Value: 是
│      └── Result: 是
│   └── Value: 否
       ┌── Feature: 肌肉疼
       │   ├── Value: 否
       │      ┌── Feature: 体温
       │      │   ├── Value: 非常高
       │      │      └── Result: 否
       │      │   ├── Value: 正常
       │      │      ┌── Feature: 流鼻涕
       │      │      │   ├── Value: 否
       │      │      │      └── Result: 否
       │      │      │   └── Value: 是
       │      │             └── Result: 否
       │      │   └── Value: 较高
       │             └── Result: 否
       │   └── Value: 是
              └── Result: 是

=== Prediction on test data ===
Test sample features:
         0
流鼻涕    否
体温     正常
肌肉疼    否
头疼      否
Prediction (cold or not): 否
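
The root choice can again be confirmed by hand (a standalone check; the class counts below are read off the 16-sample table). 头疼 has the largest information gain of the four features, with 肌肉疼 a distant second:

from math import log2

def entropy(counts):
    """Entropy of a label distribution given as a list of class counts."""
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c > 0)

h_d = entropy([12, 4])  # full data set: 12 "是", 4 "否" -> ≈ 0.8113

# 头疼 = 是: 10 samples, all "是"; 头疼 = 否: 6 samples, 2 "是" / 4 "否"
gain_headache = h_d - ((10 / 16) * entropy([10]) + (6 / 16) * entropy([2, 4]))
# 肌肉疼 = 是: 9 samples, all "是"; 肌肉疼 = 否: 7 samples, 3 "是" / 4 "否"
gain_muscle = h_d - ((9 / 16) * entropy([9]) + (7 / 16) * entropy([3, 4]))

print(f"Gain(D, 头疼)   ≈ {gain_headache:.4f}")  # ≈ 0.4669
print(f"Gain(D, 肌肉疼) ≈ {gain_muscle:.4f}")    # ≈ 0.3803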

Code implementation: C4.5

import pandas as pd
from math import log2

class C45DecisionTree:
    """C4.5 decision tree (split criterion: gain ratio)."""
    def __init__(self):
        self.tree = None            # the learned tree
        self.feature_names = None   # feature names
        self.y_train = None         # training labels

    def calc_entropy(self, y):
        """Compute the information entropy of a label series."""
        label_counts = y.value_counts()
        entropy = 0.0
        total = len(y)
        for count in label_counts:
            prob = count / total
            if prob > 0:
                entropy -= prob * log2(prob)
        return entropy

    def split_dataset(self, X, y, feature_idx, value, is_continuous=False):
        """Split the data set (supports continuous features)."""
        if is_continuous:
            # Continuous feature: binary split into "<= value" and "> value"
            mask = X.iloc[:, feature_idx] <= value
            sub_X1 = X.loc[mask].drop(X.columns[feature_idx], axis=1)
            sub_y1 = y.loc[mask]
            sub_X2 = X.loc[~mask].drop(X.columns[feature_idx], axis=1)
            sub_y2 = y.loc[~mask]
            return (sub_X1, sub_y1), (sub_X2, sub_y2)
        else:
            # Discrete feature: split on equality with value
            mask = X.iloc[:, feature_idx] == value
            sub_X = X.loc[mask].drop(X.columns[feature_idx], axis=1)
            sub_y = y.loc[mask]
            return sub_X, sub_y

    def calc_info_gain(self, X, y, feature_idx, is_continuous=False):
        """Compute the information gain (supports continuous features)."""
        base_entropy = self.calc_entropy(y)
        total = len(y)
        info_gain = 0.0
        best_split_val = None  # best threshold for a continuous feature

        if is_continuous:
            # Continuous feature: sort the values and search for the best threshold
            feature_values = sorted(X.iloc[:, feature_idx].unique())
            max_info_gain = -float('inf')
            # Candidate thresholds are the midpoints of adjacent values
            for i in range(len(feature_values) - 1):
                split_val = (feature_values[i] + feature_values[i+1]) / 2
                (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                    X, y, feature_idx, split_val, is_continuous=True
                )
                prob1 = len(sub_y1) / total
                prob2 = len(sub_y2) / total
                current_entropy = prob1 * self.calc_entropy(sub_y1) + prob2 * self.calc_entropy(sub_y2)
                current_info_gain = base_entropy - current_entropy
                if current_info_gain > max_info_gain:
                    max_info_gain = current_info_gain
                    best_split_val = split_val
            info_gain = max_info_gain
        else:
            # Discrete feature: weighted entropy over all values
            feature_values = X.iloc[:, feature_idx].unique()
            new_entropy = 0.0
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, feature_idx, value)
                prob = len(sub_y) / total
                new_entropy += prob * self.calc_entropy(sub_y)
            info_gain = base_entropy - new_entropy

        return info_gain, best_split_val

    def calc_split_info(self, X, y, feature_idx, is_continuous=False, split_val=None):
        """Compute the split information (denominator of the gain ratio)."""
        total = len(y)
        split_info = 0.0

        if is_continuous and split_val is not None:
            # Continuous feature: two branches defined by the threshold
            (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                X, y, feature_idx, split_val, is_continuous=True
            )
            prob1 = len(sub_y1) / total
            prob2 = len(sub_y2) / total
            if prob1 > 0:
                split_info -= prob1 * log2(prob1)
            if prob2 > 0:
                split_info -= prob2 * log2(prob2)
        else:
            # Discrete feature: one branch per value
            feature_values = X.iloc[:, feature_idx].unique()
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, feature_idx, value)
                prob = len(sub_y) / total
                if prob > 0:
                    split_info -= prob * log2(prob)

        return split_info

    def choose_best_feature(self, X, y, continuous_features=None):
        """Pick the feature with the largest gain ratio (the core of C4.5)."""
        if continuous_features is None:
            continuous_features = set()  # indices of continuous features
        num_features = X.shape[1]
        best_gain_ratio = -float('inf')
        best_feature_idx = -1
        best_split_val = None  # best threshold for a continuous feature

        for i in range(num_features):
            is_continuous = i in continuous_features
            # Information gain
            info_gain, split_val = self.calc_info_gain(X, y, i, is_continuous)
            # Split information; skip near-zero values to avoid division by zero
            split_info = self.calc_split_info(X, y, i, is_continuous, split_val)
            if split_info < 1e-10:
                continue
            # gain ratio = information gain / split information
            gain_ratio = info_gain / split_info

            # keep the best feature so far
            if gain_ratio > best_gain_ratio:
                best_gain_ratio = gain_ratio
                best_feature_idx = i
                best_split_val = split_val

        return best_feature_idx, best_split_val, best_feature_idx in continuous_features

    def majority_vote(self, y):
        """Majority vote: return the most frequent label."""
        return y.value_counts().idxmax()

    def build_tree(self, X, y, feature_names, continuous_features=None):
        """Recursively build the decision tree."""
        if continuous_features is None:
            continuous_features = set()

        # Stop 1: all samples share the same label
        if len(y.unique()) == 1:
            return y.iloc[0]

        # Stop 2: no features left, return the majority label
        if X.empty:
            return self.majority_vote(y)

        # Choose the best feature (largest gain ratio)
        best_idx, best_split, is_continuous = self.choose_best_feature(
            X, y, continuous_features
        )
        # Stop 3: every feature was skipped (split information ~ 0) -> majority label
        if best_idx == -1:
            return self.majority_vote(y)
        best_name = feature_names[best_idx]

        # Continuous feature: encode the threshold in the node name
        # (rounded to 2 decimals; predict_single parses it back out)
        if is_continuous:
            best_name = f"{best_name}<= {best_split:.2f}"

        tree = {best_name: {}}
        remaining_names = [f for i, f in enumerate(feature_names) if i != best_idx]
        remaining_continuous = {i for i in continuous_features if i != best_idx}
        # Re-index the continuous features (indices above best_idx shift down by 1)
        remaining_continuous = {i-1 if i > best_idx else i for i in remaining_continuous}

        if is_continuous:
            # Continuous feature: two branches, <= threshold and > threshold
            (sub_X1, sub_y1), (sub_X2, sub_y2) = self.split_dataset(
                X, y, best_idx, best_split, is_continuous=True
            )
            # Recursively build the subtrees
            tree[best_name]["是"] = self.build_tree(
                sub_X1, sub_y1, remaining_names, remaining_continuous
            )
            tree[best_name]["否"] = self.build_tree(
                sub_X2, sub_y2, remaining_names, remaining_continuous
            )
        else:
            # Discrete feature: one subtree per value
            feature_values = X.iloc[:, best_idx].unique()
            for value in feature_values:
                sub_X, sub_y = self.split_dataset(X, y, best_idx, value)
                tree[best_name][value] = self.build_tree(
                    sub_X, sub_y, remaining_names, remaining_continuous
                )

        return tree

    def fit(self, X, y, feature_names=None, continuous_features=None):
        """Train the model."""
        if feature_names is None:
            feature_names = list(X.columns)
        if continuous_features is None:
            continuous_features = set()  # column names of continuous features
        # Map continuous column names to column indices
        self.continuous_idxs = {feature_names.index(f) for f in continuous_features}
        self.feature_names = feature_names
        self.y_train = y
        self.tree = self.build_tree(X, y, feature_names, self.continuous_idxs)

    def predict_single(self, sample, tree=None):
        """Predict a single sample."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            return tree

        # Parse the current node (continuous nodes carry a threshold)
        current_node = next(iter(tree.keys()))
        if "<= " in current_node:
            # Continuous feature: recover the feature name and threshold
            feature_name, split_val = current_node.split("<= ")
            split_val = float(split_val)
            # Compare the sample's value against the threshold
            sample_val = sample[feature_name]
            if sample_val <= split_val:
                return self.predict_single(sample, tree[current_node]["是"])
            else:
                return self.predict_single(sample, tree[current_node]["否"])
        else:
            # Discrete feature: match the value directly
            feature_name = current_node
            sample_val = sample[feature_name]
            if sample_val in tree[current_node]:
                return self.predict_single(sample, tree[current_node][sample_val])
            else:
                # Unseen feature value: fall back to the training majority label
                return self.majority_vote(self.y_train)

    def predict(self, X):
        """Predict a batch of samples."""
        return X.apply(self.predict_single, axis=1)

    def print_tree(self, tree=None, indent=""):
        """Print the tree in text form."""
        if tree is None:
            tree = self.tree

        if not isinstance(tree, dict):
            print(f"{indent}└── Result: {tree}")
            return

        node_name = next(iter(tree.keys()))
        print(f"{indent}┌── Feature: {node_name}")

        values = list(tree[node_name].keys())
        for i, value in enumerate(values):
            if i == len(values) - 1:
                print(f"{indent}│   └── Branch: {value}")
                self.print_tree(tree[node_name][value], indent + "       ")
            else:
                print(f"{indent}│   ├── Branch: {value}")
                self.print_tree(tree[node_name][value], indent + "│      ")


# Demo: cold-diagnosis data with a continuous feature
if __name__ == "__main__":
    # 1. Build the data set (体温值 is a continuous temperature reading)
    data = {
        "流鼻涕": ["是", "否", "是", "是", "否", "是", "是", "是", "否", "是", "是", "否", "否", "否", "否", "否"],
        "体温值": [37.5, 39.2, 39.5, 36.5, 36.3, 37.8, 37.6, 39.1, 37.7, 36.4, 36.6, 36.2, 37.9, 39.3, 39.4, 37.4],
        "肌肉疼": ["否", "否", "是", "是", "否", "是", "否", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "头疼": ["是", "否", "是", "否", "否", "是", "是", "是", "是", "否", "是", "否", "否", "是", "是", "是"],
        "感冒": ["是", "否", "是", "是", "否", "是", "是", "是", "是", "否", "是", "是", "否", "是", "是", "是"]
    }
    df = pd.DataFrame(data)
    X = df.drop("感冒", axis=1)
    y = df["感冒"]

    # 2. Declare 体温值 as a continuous feature
    continuous_features = {"体温值"}

    # 3. Train the C4.5 tree
    print("=== Training the C4.5 decision tree ===")
    c45 = C45DecisionTree()
    c45.fit(X, y, continuous_features=continuous_features)

    # 4. Print the tree structure
    print("\n=== Decision tree structure (text form) ===")
    c45.print_tree()

    # 5. Predict a test sample
    test_data = pd.DataFrame({
        "流鼻涕": ["否"],
        "体温值": [36.2],
        "肌肉疼": ["否"],
        "头疼": ["否"]
    })
    print("\n=== Prediction on test data ===")
    print("Test sample features:")
    print(test_data.T)
    print(f"Prediction (cold or not): {c45.predict(test_data).iloc[0]}")

Output:

=== Training the C4.5 decision tree ===

=== Decision tree structure (text form) ===
┌── Feature: 头疼
│   ├── Branch: 是
│      └── Result: 是
│   └── Branch: 否
       ┌── Feature: 体温值<= 36.25
       │   ├── Branch: 是
       │      └── Result: 是
       │   └── Branch: 否
              ┌── Feature: 肌肉疼
              │   ├── Branch: 否
              │      └── Result: 否
              │   └── Branch: 是
                     └── Result: 是

=== Prediction on test data ===
Test sample features:
          0
流鼻涕     否
体温值   36.2
肌肉疼     否
头疼       否
Prediction (cold or not): 是

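To see why C4.5 retains the 体温值<= 36.25 split, its gain ratio at the 头疼 = 否 node can be recomputed by hand (a standalone sketch; the counts are read off the data set above). A convenient identity used here: the split information is simply the entropy of the branch sizes.

from math import log2

def entropy(counts):
    """Entropy of a distribution given as a list of counts."""
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c > 0)

# At the 头疼 = 否 node there are 6 samples (2 "是" / 4 "否") with
# 体温值 = 36.2, 36.3, 36.4, 36.5, 37.9, 39.2.
# Threshold 36.25 puts 1 sample ("是") on the <= side and
# 5 samples (1 "是" / 4 "否") on the > side.
h_node = entropy([2, 4])
h_cond = (1 / 6) * entropy([1]) + (5 / 6) * entropy([1, 4])
info_gain = h_node - h_cond                  # ≈ 0.3167

split_info = entropy([1, 5])                 # branch sizes 1 and 5 -> ≈ 0.6500
print(f"gain ratio ≈ {info_gain / split_info:.4f}")  # ≈ 0.4872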

前端·后端·python