R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)

转载自：http://blog.csdn.net/hawksoft/article/details/7760868

1、辅助类，用于计算过程和结果存储

[csharp] view plaincopyprint?

/// <summary>
/// 决策树节点.
/// </summary>
public class DecisionTreeNode
{
/// <summary>
/// 类型：分支或叶子
/// </summary>
public string Type { get; set; }
/// <summary>
/// 关键字一般存当前属性因子
/// </summary>
public string Key { get; set; }
/// <summary>
/// 判断值，叶子节点有效.
/// </summary>
public string DecisionValue { get; set; }
/// <summary>
/// 前一个属性因子，可以看作是分支条件.
/// </summary>
public string ParentFactor { get; set; }
/// <summary>
/// 当前节点的样本数量，
/// </summary>
public int CalcCount { get; set; }
/// <summary>
/// 当前节点的样本索引集合.
/// </summary>
public List<int> DataIndexes {get;set;}
/// <summary>
/// 分支节点集合.
/// </summary>
public Dictionary<string, DecisionTreeNode> Children { get; private set; }
/// <summary>
/// 父节点
/// </summary>
public DecisionTreeNode Parent { get; set; }
public DecisionTreeNode()
{
DataIndexes = new List<int>();
Children = new Dictionary<string, DecisionTreeNode>();
}
}
/// <summary>
/// 用于计算过程存放数据.用数组不是很方便，这里采用字典，可以减少循环次数.
/// </summary>
public class CalcNode
{
public string Key { get; set; }
public string Type { get; set; }
public int CalcCount { get; set; }
public List<int> DataIndexes {get;set;}
public Dictionary<string, CalcNode> Children { get; private set; }
public CalcNode()
{
DataIndexes = new List<int>();
Children = new Dictionary<string, CalcNode>();
}
public void AddChildren(string Key,string AType,int AIndex, int Count = 1)
{
if (Children.ContainsKey(Key) == false)
{
Children.Add(Key, new CalcNode());
}
Children[Key].Key = Key;
Children[Key].Type = AType;
Children[Key].CalcCount += Count;
Children[Key].DataIndexes.Add(AIndex);
}
}

2、算法类，注释比较详细，有时间再写一篇原理文章

[csharp] view plaincopyprint?

/// <summary>
/// 决策树算法类，不适合连续性值。
/// </summary>
public class DecisionTreeAlg
{
private string PrefixString = " ";
/// <summary>
/// 构建决策树，决策分类属性约定放在第1列。
/// </summary>
/// <param name="Inputs">行表示属性，列为值，注意列等长</param>
/// <param name="PNode">父节点</param>
/// <param name="PropertyNames">测试属性名称</param>
/// <param name="TestProperties">当前可用测试属性索引</param>
/// <param name="DefaultClassFactor">缺省判别决策分类因子</param>
/// <param name="CallLevel">用来测试输出控制，无实际作用</param>
/// <param name="OutContents">输出内容，为调试用</param>
/// <param name="PropertyFactors">属性因子</param>
public void BuildDecisionTree(int CallLevel, ref string OutContents, string[][] Inputs, DecisionTreeNode PNode, string[] PropertyNames, List<int> TestProperties, string DefaultClassFactor, Dictionary<string, List<string>> PropertyFactors)
{
string thePrefix = PrefixString.Substring(0, CallLevel * 2);
CallLevel++;
//如果没有测试属性，将当前节点设为叶子节点，选择高概率分类，然后返回
if (TestProperties.Count <= 1)
{
PNode.Type = "叶子";
PNode.DecisionValue = DefaultClassFactor;
return;
}
//如果没有学习样本集，将当前节点设为叶子节点，选择高概率分类，然后返回
if (PNode.DataIndexes.Count <= 0)
{
PNode.Type = "叶子";
PNode.DecisionValue = DefaultClassFactor;
return;
}
if (PropertyFactors == null)
{
PropertyFactors = new Dictionary<string, List<string>>();
}
//准备存储遍历时的计数存储结构
Dictionary<string, CalcNode> thePropertyCount = new Dictionary<string, CalcNode>();
foreach (var theProIndex in TestProperties)
{
thePropertyCount.Add(PropertyNames[theProIndex], new CalcNode() { Key = PropertyNames[theProIndex] });
if (PropertyFactors.ContainsKey(PropertyNames[theProIndex]) == false)
{
PropertyFactors.Add(PropertyNames[theProIndex], new List<string>());
}
}
//遍历当前可遍历的数据，进行统计，为计算各属性熵做准备
for (int n = 0; n < PNode.DataIndexes.Count; n++)
{
int theI = PNode.DataIndexes[n];
for (int k = 0; k < TestProperties.Count; k++)
{
int theJ = TestProperties[k];
var thePropertyCalcNode = thePropertyCount[PropertyNames[theJ]];
//对当前属性计数
thePropertyCalcNode.CalcCount++;
//对第j个属性的当前因子计数
thePropertyCalcNode.AddChildren(Inputs[theJ][theI], "测试属性因子", theI, 1);
//对第j个属性的当前因子的主分类因子计数
thePropertyCalcNode.Children[Inputs[theJ][theI]].AddChildren(Inputs[0][theI], "主分类因子", theI, 1);
//统计归纳各属性因子，采用这种方式可以减少循环.
if (PropertyFactors[PropertyNames[theJ]].Contains(Inputs[theJ][theI]) == false)
{
PropertyFactors[PropertyNames[theJ]].Add(Inputs[theJ][theI]);
}
}
}
//计算信息增益量,获取具有最大信息增益属性
string theDefaultClassFactor = DefaultClassFactor;
//初始化最大测试属性熵值.
double theMaxEA = double.MinValue;
//记录具有最大熵值属性的索引位置
int theMaxPropertyIndex = TestProperties[1];
//总信息熵值，其实就是分类属性的熵值.
double theTotalEA = 0.0;
//记录总的样本数，用于估算概率.
double theTotalSimple = 0;
for(int theI=0;theI<TestProperties.Count;theI++)
{
int thePIndex_1 = TestProperties[theI];
if (thePIndex_1 == 0)
{
//主分类熵值计算，计算公式与测试属性有所不同.
CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]];
double theCount = theCalcNode.CalcCount;
theTotalSimple = theCount;
double theMaxSubCount = -1;
theTotalEA = 0.0;
//求和(-Pj*log2(Pj))
foreach (var theSubNode in theCalcNode.Children)
{
if (theSubNode.Value.CalcCount > 0)
{
double thePj = theSubNode.Value.CalcCount / theCount;
theTotalEA += 0 - thePj * Math.Log(thePj, 2);
}
if (theMaxSubCount < theSubNode.Value.CalcCount)
{
theMaxSubCount = theSubNode.Value.CalcCount;
theDefaultClassFactor = theSubNode.Key;
}
//测试输出，跟踪计算路径.
OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + PropertyNames[thePIndex_1] + ":: " + theSubNode.Value.Type + " :: " + theSubNode.Key + " :: " + theSubNode.Value.CalcCount;
}
}
else
{
//测试属性熵值计算。
CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]];
double theJEA = 0.0;
foreach (var theSubNode_1 in theCalcNode.Children)
{
if (theSubNode_1.Value.CalcCount > 0)
{
double theSjCount = theSubNode_1.Value.CalcCount;
double theSj_1 = theSjCount / theTotalSimple;
double theSj_2 = 0.0;
foreach (var theSubNode_2 in theSubNode_1.Value.Children)
{
if (theSubNode_2.Value.CalcCount > 0)
{
double thePj_1 = Convert.ToDouble(theSubNode_2.Value.CalcCount) / theSjCount;
theSj_2 += 0.0 - thePj_1 * Math.Log(thePj_1, 2);
}
OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + PropertyNames[thePIndex_1] + " :: " + theSubNode_1.Value.Type + " :: " + theSubNode_1.Key + " :: " + theSubNode_1.Value.CalcCount
+ theSubNode_2.Value.Type + " :: " + theSubNode_2.Key + " :: " + theSubNode_2.Value.CalcCount;
}
theJEA += theSj_1 * theSj_2;
}
}
theJEA = theTotalEA - theJEA;
//只记录最大熵值属性信息.
if (theMaxEA < theJEA)
{
theMaxEA = theJEA;
theMaxPropertyIndex = thePIndex_1;
}
}
}
//如果分类因子只有一个，则置当前节点为叶子节点，设置判定为当前分类因子，然后返回
if (thePropertyCount[PropertyNames[0]].Children.Count <= 1)
{
PNode.Type = "叶子";
PNode.DecisionValue = theDefaultClassFactor;
return;
}
//具有多个分类因子，还剩有测试属性，则设当前节点为分支节点，准备分支.
PNode.Type = "分支";
//1选取最大增益信息量测试属性，做分支处理,做处理,注意属性一旦处理，将不在后续节点中再处理
//因此需要在测试属性集合中删除所选测试属性.注意保持分类属性在开始索引处(0).
PNode.Key = PropertyNames[theMaxPropertyIndex];
CalcNode theCalcNode_2 = thePropertyCount[PropertyNames[theMaxPropertyIndex]];
List<string> theFactors = PropertyFactors[PropertyNames[theMaxPropertyIndex]];
List<int> theAvailableTestPs = new List<int>();
for (int i = 0; i < TestProperties.Count; i++)
{
if (theMaxPropertyIndex != TestProperties[i])
{
theAvailableTestPs.Add(TestProperties[i]);
}
}
//对所选测试属性的所有因子进行处理.
foreach (var theFactor_1 in theFactors)
{
//如果当前因子不在计算中，则添加一个叶子节点，判定为高概率分类。
if (theCalcNode_2.Children.ContainsKey(theFactor_1) == false)
{
DecisionTreeNode theNode_1 = new DecisionTreeNode();
theNode_1.ParentFactor = theFactor_1;
theNode_1.CalcCount = 0;
theNode_1.DecisionValue = theDefaultClassFactor;
theNode_1.Parent = PNode;
theNode_1.Key = theFactor_1;
theNode_1.Type = "叶子";
PNode.Children.Add(theFactor_1, theNode_1);
continue;
}
//如果当前因子存在，但不存在样本，则添加一个叶子节点，判定为高概率分类。
if (theCalcNode_2.Children[theFactor_1].CalcCount<=0)
{
DecisionTreeNode theNode_1 = new DecisionTreeNode();
theNode_1.ParentFactor = theFactor_1;
theNode_1.CalcCount = 0;
theNode_1.DecisionValue = theDefaultClassFactor;
theNode_1.Parent = PNode;
theNode_1.Type = "叶子";
theNode_1.Key = theFactor_1;
PNode.Children.Add(theFactor_1, theNode_1);
continue;
}
//如果存在，且有学习样本，则添加一个节点，并以此节点递归处理.
DecisionTreeNode theNode_2 = new DecisionTreeNode();
theNode_2.ParentFactor = theFactor_1;
theNode_2.Parent = PNode;
theNode_2.Key = theFactor_1;
theNode_2.CalcCount = theCalcNode_2.Children[theFactor_1].CalcCount;
theNode_2.DataIndexes.AddRange(theCalcNode_2.Children[theFactor_1].DataIndexes);
PNode.Children.Add(theFactor_1, theNode_2);
BuildDecisionTree(CallLevel, ref OutContents, Inputs, theNode_2, PropertyNames, theAvailableTestPs, theDefaultClassFactor, PropertyFactors);
}
}
}

3、测试代码：

[csharp] view plaincopyprint?

private void button1_Click(object sender, EventArgs e)
{
DecisionTreeAlg theAlg = new DecisionTreeAlg();
string[][] theInputs = new string[4][];
theInputs[0] = new string[] { "no", "yes", "yes", "yes", "yes", "yes", "no", "yes", "yes", "no" };
theInputs[1] = new string[] { "s", "s", "l", "m", "l", "m", "m", "l", "m", "s" };
theInputs[2] = new string[] { "s", "l", "m", "m", "m", "l", "s", "m", "s", "s" };
theInputs[3] = new string[] { "no", "yes", "yes", "yes", "no", "no", "no", "no", "no", "yes" };
string[] thePropertyName = new string[] {"是否真实帐号","日志密度","好友密度","是否真实头像" };
DecisionTreeNode theRootNode = new DecisionTreeNode();
theRootNode.DataIndexes.AddRange(new List<int>() { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
List<int> theTestPs = new List<int>() { 0, 1, 2, 3 };
string theOuts = "";
theAlg.BuildDecisionTree(0,ref theOuts, theInputs, theRootNode, thePropertyName, theTestPs, "", null);
this.treeView1.Nodes.Clear();
TreeNode theRoot = new TreeNode();
this.treeView1.Nodes.Add(theRoot);
VisitTree(theRoot, theRootNode);
this.textBox1.Text = theOuts;
}
private void VisitTree(TreeNode PNode, DecisionTreeNode PDNode)
{
PNode.Text = PDNode.Key + "(" + PDNode.Type + ")[判定："+PDNode.DecisionValue +"]";
foreach (var theNode in PDNode.Children.Values)
{
TreeNode theTmpNode = new TreeNode();
PNode.Nodes.Add(theTmpNode);
VisitTree(theTmpNode, theNode);
}
}

R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)相关推荐

R语言学习系列之本地数据获取
R语言学习系列之本地数据获取任何数据分析工作之前,都得把数据先读取进来你才能进行后续的分析工作.所以本文简要介绍在R中如何对本地文件进行获取,希望可以给刚刚接触R语言的同学一点启发. 一.控制台的输 ...
R语言学习系列之向量化计算
##R语言学习系列之向量化计算本文主要讲解R语言向量化计算的原理及方法,希望对初学者能够提供帮助. ##一.向量化什么是向量化计算呢?其实你可以简单的理解成这样:当我们在使用函数或者定义函数的时候 ...
R语言学习系列教程及高级绘图工具使用
R语言系列教程引言:每天学点R语言 R语言的安装 Rstudio的安装和运行 R语言常用语法代码示例 R语言数据导入测试代码及数据 R语言数据操作示例及数据 R语言数据操作进阶及控制结构 R语言中处 ...
R语言学习路径和感受
第一次接触R语言是我读研的时候,算到现在有5年多了.R语言可以算得上是我进入编程世界的启蒙语言,尽管在大学期间为了考试而被迫学习过计算机二级,但那真心是没有一丁点的兴趣可言.进入R的世界后,真的越来越 ...
R语言学习笔记（1~3）
R语言学习笔记(1~3) 一.R语言介绍 x <- rnorm(5) 创建了一个名为x的向量对象,它包含5个来自标准正态分布的随机偏差. 1.1 注释由符号#开头. #函数c()以向量的形式输 ...
R语言学习手记 (1)
R语言学习手记 (1) 经管的会计和财管都会学数据统计与分析R语言这门课,加上我也有点兴趣,就提前选了这门课,以下的笔记由老师上课的PPT.<R语言编程艺术>和<R语言数据科学> ...
R语言学习笔记——入门篇：第一章-R语言介绍
R语言 R语言学习笔记--入门篇:第一章-R语言介绍文章目录 R语言一.R语言简介 1.1.R语言的应用方向 1.2.R语言的特点二.R软件的安装 2.1.Windows/Mac 2.2.Lin ...
Ruby语言学习系列--基本的ruby语法
Ruby语言学习系列--基本的ruby语法 1. 基本的ruby语法 1.1 变量.常量和类型 1) 定义变量变量类型描述示例局部变量(或伪变量) 以小写字母或下划 ...
2021年R语言学习路线以及资源推荐
R语言学习从入门到熟练掌握资源推荐最近有些小伙伴在后台问我要一些资源,呃,前段时间一直没看到消息,现在统一整理一下,由于微信不让发这个,我就放在公众号里了,在公众号里回复[R语言学习资源]即可,如果 ...

R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)

R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)相关推荐

最新文章

热门文章