赞
踩
目前玩机器学习的小伙伴,上来就是使用现有的sklearn机器学习包,写两行代码,调调参数就能跑起来,看似方便,实则有时不利于个人能力发展,要知道现在公司需要的算法工程师,不仅仅只是会调参(这种工作,入门几个月的人就可以干了),而是要深入底层,能优化代码,能自己搭。
本文章适合以下几类人:
1)初学者,了解机器学习的实现过程
2)想提升自己的代码能力
第一步:原理
决策树可以被简单的看成是一些if 和else,其优点:计算复杂度不高,输出结果易于理解,对中间值的缺失不敏感,可以处理不相关特征数据。其缺点:可能会产生过度匹配问题。决策树相关详细理论的博客,网上有很多,我就不重复啰嗦了,感兴趣的可以看下这个:十大经典算法之C4.5算法综述 - 知乎
第二步:代码实现
- #include <vector>
- #include <set>
- #include <map>
- #include <string>
- #include <fstream>
- #include <sstream>
- #include <iostream>
- #include <math.h>
-
- using namespace std;
-
-
- /*******树的构造*******/
- struct TreeNode {
- string m_sAttribute;//节点名字
- int m_iDeciNum; //yes 数
- int m_iUnDecinum; //no 数
- vector<TreeNode*> m_vChildren;
- };
-
-
- TreeNode* CreateTreeNode(string value)
- {
- TreeNode* pNode = new TreeNode();
- pNode->m_sAttribute = value;
- return pNode;
- }
-
- bool FindNode(TreeNode* pRoot, std::string& item)
- {
-
- if (pRoot->m_sAttribute == item)
- return true;
-
- bool found = false;
-
- vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
- while (!found && i < pRoot->m_vChildren.end())
- {
- found = FindNode(*i, item);
- ++i;
- }
-
- return found;
- }
-
- void ConnectTreeNodes(TreeNode* pParent, TreeNode* pChild)
- {
- if (pParent != NULL)
- {
- pParent->m_vChildren.push_back(pChild);
- }
- }
-
- void PrintTreeNode(TreeNode* pNode)
- {
- if (pNode != NULL)
- {
- printf("value of this node is: %d.\n", pNode->m_sAttribute);
- printf("its children is as the following:\n");
- std::vector<TreeNode*>::iterator i = pNode->m_vChildren.begin();
- while (i < pNode->m_vChildren.end())
- {
- if (*i != NULL)
- printf("%s\t", (*i)->m_sAttribute);
- ++i;
- }
- printf("\n");
- }
- else
- {
- printf("this node is null.\n");
- }
-
- printf("\n");
- }
-
- void PrintTree(TreeNode* pRoot)
- {
- PrintTreeNode(pRoot);
-
- if (pRoot != NULL)
- {
- std::vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
- while (i < pRoot->m_vChildren.end())
- {
- PrintTree(*i);
- ++i;
- }
- }
- }
-
- void DestroyTree(TreeNode* pRoot)
- {
- if (pRoot != NULL)
- {
- std::vector<TreeNode*>::iterator i = pRoot->m_vChildren.begin();
- while (i < pRoot->m_vChildren.end())
- {
- DestroyTree(*i);
- ++i;
- }
- delete pRoot;
- }
- }
- /*******树的构造*******/
-
-
- class DecisionTree {
- private:
-
- struct attrItem
- {
- std::vector<int> itemNum; //itemNum[0] = itemLine.size()
- //itemNum[1] = decision num
- set<int> itemLine; //可用行
- };
- //重点
- struct attributes
- {
- string attriName; //属性名字
- vector<double> statResult;
- map<string, attrItem*> attriItem;//存放子目录的信息
- };
-
- vector<attributes*> statTree;
- int attriNum;
- vector<vector<string>> infos;
- map<string, int> attr_clum;//作用不大
-
- public:
- DecisionTree() {
- attriNum = 0;
- }
- vector<vector<string>>& getInfos()
- {
- return infos;
- }
- vector<attributes*>& getStatTree()
- {
- return statTree;
- }
- int pretreatment(string filename, set<int>& readLineNum, vector<int>& readClumNum);
- int statister(vector<vector<string>>& infos, vector<attributes*>& statTree,
- set<int>& readLine, vector<int>& readClumNum);
- int compuDecisiNote(vector<attributes*>& statTree, int deciNum, int lineNum, vector<int>& readClumNum);
- double info_D(int deciNum, int sum);
- void resetStatTree(vector<attributes*>& statTree, vector<int>& readClumNum);
- double Info_attr(map<string, attrItem*>& attriItem, double& splitInfo, int lineNum);
- void CreatTree(TreeNode* &treeHead, vector<attributes*>& statTree, vector<vector<string>>& infos,
- set<int>& readLine, vector<int>& readClumNum, int deep);
- };
-
-
- /*
- * @function CreatTree 预处理函数,负责读入数据,并生成信息矩阵和属性标记
- * @param: filename 文件名
- * @param: readLineNum 可使用行set
- * @param: readClumNum 可用属性vector 0可用 1不可用
- * @return int 返回函数执行状态
- */
-
- int DecisionTree::pretreatment(string filename, set<int>& readLineNum, vector<int>& readClumNum)
- {
-
- }
-
- /*
- * @function Info_attr info_D 总信息量
- * @param: deciNum 有效信息数
- * @param: sum 总信息量
- * @return double 总信息量比例
- */
- double DecisionTree::info_D(int deciNum, int sum)
- {
- double pi = (double)deciNum / (double)sum;
- double result = 0.0;
- if ((1.0 - pi) < 0.000001 || (pi - 0.0) < 0.000001)
- {
- return result;
- }
- result = pi * (log(pi) / log((double)2)) + (1 - pi)*(log(1 - pi) / log((double)2));
- return -result;
- }
-
- /*
- * @function Info_attr 总信息量
- * @param: deciNum 有效信息数
- * @param: sum 总信息量
- * @return double
- */
- double DecisionTree::Info_attr(map<string, attrItem*>& attriItem, double& splitInfo, int lineNum)
- {
- double result = 0.0;
- for (map<string, attrItem*>::iterator item = attriItem.begin();
- item != attriItem.end();
- ++item
- )
- {
- double pi = (double)(item->second->itemNum[0]) / (double)lineNum;
- splitInfo += pi * (log(pi) / log((double)2));
- double sub_attr = info_D(item->second->itemNum[1], item->second->itemNum[0]);
- result += pi * sub_attr;
- }
- splitInfo = -splitInfo;
- return result;
- }
-
- /*
- * @function compuDecisiNote 计算C4.5
- * @param: statTree 为状态树,此树动态更新,但是由于是DFS对数据更新,所以不必每次新建状态树
- * @param: deciNum Yes的数据量
- * @param: lineNum 计算set的行数
- * @param: readClumNum 用于计算的set
- * @return int 信息量最大的属性号
- */
- int DecisionTree::compuDecisiNote(vector<attributes*>& statTree, int deciNum, int lineNum, vector<int>& readClumNum)
- {
- double max_temp = 0;
- int max_attribute = 0;
- //总的yes行的信息量
- double infoD = info_D(deciNum, lineNum);
- for (int i = 0; i < attriNum - 1; i++)
- {
- if (readClumNum[i] == 0)
- {
- double splitInfo = 0.0;
- //info
- double info_temp = Info_attr(statTree[i]->attriItem, splitInfo, lineNum);
- statTree[i]->statResult.push_back(info_temp);
- //gain
- double gain_temp = infoD - info_temp;
- statTree[i]->statResult.push_back(gain_temp);
- //split_info
- statTree[i]->statResult.push_back(splitInfo);
- //gain_info
- double temp = gain_temp / splitInfo;
- statTree[i]->statResult.push_back(temp);
- //得到最大值*/
- if (temp > max_temp)
- {
- max_temp = temp;
- max_attribute = i;
- }
- }
- }
- return max_attribute;
- }
-
- /*
- * @function resetStatTree 清理状态树
- * @param: statTree 状态树
- * @param: readClumNum 需要清理的属性set
- * @return void
- */
-
- void DecisionTree::resetStatTree(vector<attributes*>& statTree, vector<int>& readClumNum)
- {
- for (int i = 0; i < readClumNum.size() - 1; i++)
- {
- if (readClumNum[i] == 0)
- {
- map<string, attrItem*>::iterator it_end = statTree[i]->attriItem.end();
- for (map<string, attrItem*>::iterator it = statTree[i]->attriItem.begin();
- it != it_end; it++)
- {
- delete it->second;
- }
- statTree[i]->attriItem.clear();
- statTree[i]->statResult.clear();
- }
- }
- }
-
-
- int main(int argc, char* argv[]) {
- string filename = "tree.txt";
- DecisionTree dt;
- int attr_node = 0;
- TreeNode* treeHead = nullptr;
- set<int> readLineNum;
- vector<int> readClumNum;
- int deep = 0;
- if (dt.pretreatment(filename, readLineNum, readClumNum) == 0)
- {
- dt.CreatTree(treeHead, dt.getStatTree(), dt.getInfos(), readLineNum, readClumNum, deep);
- }
- return 0;
- }
第三步:运行过程
运行结果
用到的软件是vs2010以上的版本都可以,不用额外配置什么,没调包,会用这个软件进行c++开发,就会使用这个软件
此程序由于不调用任何外源库,所以读者可以看清楚每一个算法的原理,要想学好机器学习算法,必须打好基础,不要好高骛远,另外,程序都是有备注,应该很好理解的,实在不懂,可以来问店主
代码的下载路径(新窗口打开链接):机器学习算法决策树C4.5之c++实现(不调用外源库)
有问题可以私信或者留言,有问必答
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。