目前玩机器学习的小伙伴,上来就是使用现有的sklearn机器学习包,写两行代码,调调参数就能跑起来,看似方便,实则有时不利于个人能力发展,要知道现在公司需要的算法工程师,不仅仅只是会调参(这种工作,入门几个月的人就可以干了),而是要深入底层,能优化代码,能自己搭。
本文章适合以下几类人:
1)初学者,了解机器学习的实现过程
2)想提升自己的代码能力
第一步:原理
决策树可以被简单的看成是一些if 和else,其优点 :计算复杂度不高,输出结果易于理解,对中间值的缺失不敏感,可以处理不相关特征数据。其缺点 :可能会产生过度匹配问题。决策树相关详细理论的博客,网上有很多,我就不重复啰嗦了
第二步:代码实现
cpp
#include <vector>
#include <set>
#include <map>
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
#include <math.h>
using namespace std;
/*******树的构造*******/
struct TreeNode {
string m_sAttribute;//节点名字
int m_iDeciNum; //yes 数
int m_iUnDecinum; //no 数
vector<TreeNode*> m_vChildren;
};
TreeNode* CreateTreeNode(string value)
{
TreeNode* pNode = new TreeNode();
pNode->m_sAttribute = value;
return pNode;
}
bool FindNode(TreeNode* pRoot, std::string& item)
{
if (pRoot->m_sAttribute == item)
return true;
bool found = false;
vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
while (!found && i < pRoot->m_vChildren.end())
{
found = FindNode(*i, item);
++i;
}
return found;
}
void ConnectTreeNodes(TreeNode* pParent, TreeNode* pChild)
{
if (pParent != NULL)
{
pParent->m_vChildren.push_back(pChild);
}
}
void PrintTreeNode(TreeNode* pNode)
{
if (pNode != NULL)
{
printf("value of this node is: %d.\n", pNode->m_sAttribute);
printf("its children is as the following:\n");
std::vector<TreeNode*>::iterator i = pNode->m_vChildren.begin();
while (i < pNode->m_vChildren.end())
{
if (*i != NULL)
printf("%s\t", (*i)->m_sAttribute);
++i;
}
printf("\n");
}
else
{
printf("this node is null.\n");
}
printf("\n");
}
void PrintTree(TreeNode* pRoot)
{
PrintTreeNode(pRoot);
if (pRoot != NULL)
{
std::vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
while (i < pRoot->m_vChildren.end())
{
PrintTree(*i);
++i;
}
}
}
void DestroyTree(TreeNode* pRoot)
{
if (pRoot != NULL)
{
std::vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
while (i < pRoot->m_vChildren.end())
{
DestroyTree(*i);
++i;
}
delete pRoot;
}
}
/*******树的构造*******/
class DecisionTree {
private:
struct attrItem
{
std::vector<int> itemNum; //itemNum[0] = itemLine.size()
//itemNum[1] = decision num
set<int> itemLine; //可用行
};
//重点
struct attributes
{
string attriName; //属性名字
vector<double> statResult;
map<string, attrItem*> attriItem;//存放子目录的信息
};
vector<attributes*> statTree;
int attriNum;
vector<vector<string>> infos;
map<string, int> attr_clum;//作用不大
public:
DecisionTree() {
attriNum = 0;
}
vector<vector<string>>& getInfos()
{
return infos;
}
vector<attributes*>& getStatTree()
{
return statTree;
}
int pretreatment(string filename, set<int>& readLineNum, vector<int>& readClumNum);
int statister(vector<vector<string>>& infos, vector<attributes*>& statTree,
set<int>& readLine, vector<int>& readClumNum);
int compuDecisiNote(vector<attributes*>& statTree, int deciNum, int lineNum, vector<int>& readClumNum);
double info_D(int deciNum, int sum);
void resetStatTree(vector<attributes*>& statTree, vector<int>& readClumNum);
double Info_attr(map<string, attrItem*>& attriItem, double& splitInfo, int lineNum);
void CreatTree(TreeNode* &treeHead, vector<attributes*>& statTree, vector<vector<string>>& infos,
set<int>& readLine, vector<int>& readClumNum, int deep);
};
/*
* @function CreatTree 预处理函数,负责读入数据,并生成信息矩阵和属性标记
* @param: filename 文件名
* @param: readLineNum 可使用行set
* @param: readClumNum 可用属性vector 0可用 1不可用
* @return int 返回函数执行状态
*/
int DecisionTree::pretreatment(string filename, set<int>& readLineNum, vector<int>& readClumNum)
{
}
/*
* @function Info_attr info_D 总信息量
* @param: deciNum 有效信息数
* @param: sum 总信息量
* @return double 总信息量比例
*/
double DecisionTree::info_D(int deciNum, int sum)
{
double pi = (double)deciNum / (double)sum;
double result = 0.0;
if ((1.0 - pi) < 0.000001 || (pi - 0.0) < 0.000001)
{
return result;
}
result = pi * (log(pi) / log((double)2)) + (1 - pi)*(log(1 - pi) / log((double)2));
return -result;
}
/*
* @function Info_attr 总信息量
* @param: deciNum 有效信息数
* @param: sum 总信息量
* @return double
*/
double DecisionTree::Info_attr(map<string, attrItem*>& attriItem, double& splitInfo, int lineNum)
{
double result = 0.0;
for (map<string, attrItem*>::iterator item = attriItem.begin();
item != attriItem.end();
++item
)
{
double pi = (double)(item->second->itemNum[0]) / (double)lineNum;
splitInfo += pi * (log(pi) / log((double)2));
double sub_attr = info_D(item->second->itemNum[1], item->second->itemNum[0]);
result += pi * sub_attr;
}
splitInfo = -splitInfo;
return result;
}
/*
* @function compuDecisiNote 计算C4.5
* @param: statTree 为状态树,此树动态更新,但是由于是DFS对数据更新,所以不必每次新建状态树
* @param: deciNum Yes的数据量
* @param: lineNum 计算set的行数
* @param: readClumNum 用于计算的set
* @return int 信息量最大的属性号
*/
int DecisionTree::compuDecisiNote(vector<attributes*>& statTree, int deciNum, int lineNum, vector<int>& readClumNum)
{
double max_temp = 0;
int max_attribute = 0;
//总的yes行的信息量
double infoD = info_D(deciNum, lineNum);
for (int i = 0; i < attriNum - 1; i++)
{
if (readClumNum[i] == 0)
{
double splitInfo = 0.0;
//info
double info_temp = Info_attr(statTree[i]->attriItem, splitInfo, lineNum);
statTree[i]->statResult.push_back(info_temp);
//gain
double gain_temp = infoD - info_temp;
statTree[i]->statResult.push_back(gain_temp);
//split_info
statTree[i]->statResult.push_back(splitInfo);
//gain_info
double temp = gain_temp / splitInfo;
statTree[i]->statResult.push_back(temp);
//得到最大值*/
if (temp > max_temp)
{
max_temp = temp;
max_attribute = i;
}
}
}
return max_attribute;
}
/*
* @function resetStatTree 清理状态树
* @param: statTree 状态树
* @param: readClumNum 需要清理的属性set
* @return void
*/
void DecisionTree::resetStatTree(vector<attributes*>& statTree, vector<int>& readClumNum)
{
for (int i = 0; i < readClumNum.size() - 1; i++)
{
if (readClumNum[i] == 0)
{
map<string, attrItem*>::iterator it_end = statTree[i]->attriItem.end();
for (map<string, attrItem*>::iterator it = statTree[i]->attriItem.begin();
it != it_end; it++)
{
delete it->second;
}
statTree[i]->attriItem.clear();
statTree[i]->statResult.clear();
}
}
}
int main(int argc, char* argv[]) {
string filename = "tree.txt";
DecisionTree dt;
int attr_node = 0;
TreeNode* treeHead = nullptr;
set<int> readLineNum;
vector<int> readClumNum;
int deep = 0;
if (dt.pretreatment(filename, readLineNum, readClumNum) == 0)
{
dt.CreatTree(treeHead, dt.getStatTree(), dt.getInfos(), readLineNum, readClumNum, deep);
}
return 0;
}
第三步:运行过程
IDE编译软件用vs2010以上版本,运行结果
第四步:项目源码下载:
整套算法系列:深度学习与机器学习_AI洲抿嘴的薯片的博客-CSDN博客
项目源码下载地址:关注文末【AI街潜水的八角】,回复【决策树机器学习算法】即可下载
整套项目源码内容包含
程序里面包括决策树C4.5机器学习算法,接近上千行代码