决策树模型python代码实现

计算熵函数(二元熵)

python 复制代码
# UNQ_C1
# GRADED FUNCTION: compute_entropy

def compute_entropy(y):
    """
    Computes the entropy for 
    
    Args:
       y (ndarray): Numpy array indicating whether each example at a node is
           edible (`1`) or poisonous (`0`)
       
    Returns:
        entropy (float): Entropy at that node
        
    """
    # You need to return the following variables correctly
    entropy = 0
    
    ### START CODE HERE ###
    p=len(y[y==1])/len(y)
    if pos==size or pos==0:
        entropy=0
    else:
        entropy=-p*np.log2(p)-(1-p)*np.log2(1-p)
    ### END CODE HERE ###        
    
    return entropy

根据特征分裂结点

python 复制代码
# UNQ_C2
# GRADED FUNCTION: split_dataset

def split_dataset(X, node_indices, feature):
    """
    Splits the data at the given node into
    left and right branches
    
    Args:
        X (ndarray):             Data matrix of shape(n_samples, n_features)
        node_indices (ndarray):  List containing the active indices. I.e, the samples being considered at this step.
        feature (int):           Index of feature to split on
    
    Returns:
        left_indices (ndarray): Indices with feature value == 1
        right_indices (ndarray): Indices with feature value == 0
    """
    
    # You need to return the following variables correctly
    left_indices = []
    right_indices = []
    
    ### START CODE HERE ###
    for i in node_indices:
        if X[i,feature]==1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    ### END CODE HERE ###
        
    return left_indices, right_indices

计算信息增益

python 复制代码
# UNQ_C3
# GRADED FUNCTION: compute_information_gain

def compute_information_gain(X, y, node_indices, feature):
    
    """
    Compute the information of splitting the node on a given feature
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
   
    Returns:
        cost (float):        Cost computed
    
    """    
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)
    
    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]
    
    # You need to return the following variables correctly
    information_gain = 0
    
    ### START CODE HERE ###
    
    # Weights 
    lefts=len(y_left)
    rights=len(y_right)
    w_left=lefts/(lefts+rights)
    w_right=rights/(lefts+rights)
    #Weighted entropy
    H_left=compute_entropy(y_left)
    H_right=compute_entropy(y_right)
    #Information gain                                                   
    H_root=compute_entropy(y_node)
    information_gain=H_root-(w_left*H_left+w_right*H_right)
    ### END CODE HERE ###  
    
    return information_gain

选择最佳分裂特征

python 复制代码
# UNQ_C4
# GRADED FUNCTION: get_best_split

def get_best_split(X, y, node_indices):   
    """
    Returns the optimal feature and threshold value
    to split the node data 
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.

    Returns:
        best_feature (int):     The index of the best feature to split
    """    
    
    # Some useful variables
    num_features = X.shape[1]
    
    # You need to return the following variables correctly
    best_feature = -1

    IG=0
    ### START CODE HERE ###
    for i in range(num_features):
        ig=compute_information_gain(X,y,node_indices,i)
        if ig>IG:
            IG=ig
            best_feature=i
    ### END CODE HERE ##    
   
    return best_feature

随机森林算法

python 复制代码
def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that split the dataset into 2 subgroups at each node.
    This function just prints the tree.
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int):        Max depth of the resulting tree. 
        current_depth (int):    Current depth. Parameter used during recursive call.
   
    """ 

    # Maximum depth reached - stop splitting
    if current_depth == max_depth:
        formatting = " "*current_depth + "-"*current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return
   
    # Otherwise, get best split and split the data
    # Get the best feature and threshold at this node
    best_feature = get_best_split(X, y, node_indices) 
    tree.append((current_depth, branch_name, best_feature, node_indices))
    
    formatting = "-"*current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
    
    # Split the dataset at the best feature
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    
    # continue splitting the left and the right child. Increment current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)
相关推荐
Ulyanov27 分钟前
《PySide6 GUI开发指南:QML核心与实践》 第二篇:QML语法精要——构建声明式UI的基础
java·开发语言·javascript·python·ui·gui·雷达电子对抗系统仿真
简简单单做算法1 小时前
基于GA遗传优化双BP神经网络的时间序列预测算法matlab仿真
神经网络·算法·matlab·时间序列预测·双bp神经网络
刀法如飞1 小时前
一款Python语言Django框架DDD脚手架,助你快速搭建项目
python·ddd·脚手架
刀法如飞1 小时前
一款Python语言Django框架DDD脚手架,适合中大型项目
后端·python·领域驱动设计
guygg881 小时前
利用遗传算法解决列车优化运行问题的MATLAB实现
开发语言·算法·matlab
武藤一雄1 小时前
19个核心算法(C#版)
数据结构·windows·算法·c#·排序算法·.net·.netcore
MediaTea1 小时前
Scikit-learn:数据集
人工智能·python·机器学习·scikit-learn
sali-tec1 小时前
C# 基于OpenCv的视觉工作流-章52-交点查找
图像处理·人工智能·opencv·算法·计算机视觉
yu85939582 小时前
MATLAB连续线性化模型预测控制(SL-MPC)
算法·机器学习·matlab
ytttr8732 小时前
基于ACADO工具包的自主车道跟踪与避障MPC控制
算法