本文作者:合肥工业大学 管理学院 钱洋 email:1563178220@qq.com 内容可能有不到之处,欢迎交流。






 //读取数据集public void prepareMatrix(String FileNameToRead,int PercentageOfDataToLEarnFrom) {BufferedReader br = null;try {br = new BufferedReader(new FileReader(FileNameToRead));int NumberOfColoumns = 0;// Reading Header  读取第一行{String line = br.readLine();StringTokenizer st = new StringTokenizer(line);boolean notNum = true;while (st.hasMoreElements()) {if (notNum) { //为true,添加属性Headers.add((String) st.nextElement());notNum = false;} else {st.nextElement(); //跳过数组notNum = true;}}}//最后添加一个类别----此时头共有7个Headers.add("Class");/** System.out.println("Lets Check Headers"); for(String temp:head){* System.out.println("\t "+temp); }*/NumberOfColoumns = Headers.size();coloumns = NumberOfColoumns;{// lets read coloumnsString line = br.readLine();while (line != null) {// System.out.print(line);StringTokenizer st = new StringTokenizer(line);// System.out.println("---- Split by space ------");int[] tempCol = new int[NumberOfColoumns];int tempIndex = 0;while (st.hasMoreElements()) {tempCol[tempIndex++] = Integer.parseInt((String) st.nextElement());// System.out.println(tempCol[tempIndex-1]);}rows.add(tempCol);line = br.readLine();}}// Now the truncating part,int rowsAfterTrunc = (int) ((PercentageOfDataToLEarnFrom * (rows.size())) / 100);if (rowsAfterTrunc == rows.size()) {// do nothing} else {for (int i = rows.size() - 1; i > rowsAfterTrunc; i--) {rows.remove(i);}}Numrows = rows.size();} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();} finally {try {br.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}//数据封装成数组for (int i = 0; i < rows.size(); i++) {int[] arr = rows.get(i);for (int j = 0; j < arr.length; j++) {System.out.print(arr[j]);System.out.print("\t");}System.out.println("");}}


ArrayList<int[]> rows;// rows of the matrix; each int[] holds the parsed integers of one data line



/*** */
package com.qian.id3;import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/*** @author Anupam Gangotia* Profile::http://en.gravatar.com/gangotia* github::https://github.com/agangotia*/
/**
 * This is the learner class. It learns from the training values and stores
 * the result in a decision tree.
 */
public class ID3Learner {String FileNameToRead;// filename to read the training data setint PercentageOfDataToLEarnFrom;// this is the percent , which shows how// many line to be read from input// Paramaterized constructorpublic ID3Learner(String FileName, int percs) {FileNameToRead = FileName;PercentageOfDataToLEarnFrom = percs;}/** Function :: startLearning, This function, reads the file values into a* MxN Matrix datatype, The data from Matrix is further split across a set* of Training Vectors, and a FinalClass Vector. As our ID3 algortihm takes* set of Training vectors & final class vector as inputs. This* function,internally calls the learnTree Function, which is an* implementation of ID3 algorithm*/public TreeNode startLearning() {if (FileNameToRead == null) {System.out.println("---- Error ------");System.out.println("---- Please Specify test data set ------");}if (PercentageOfDataToLEarnFrom < 0) {System.out.println("---- Error ------");System.out.println("---- Please Specify %correctly ------");}MatrixData matrix = new MatrixData();// Prepares a new matrix datatypematrix.prepareMatrix(FileNameToRead, PercentageOfDataToLEarnFrom);// reads// 训练数据,不包含决策变量HashMap<String, int[]> setTrainingVector = new HashMap<String, int[]>();// Now i need a set of R training vectors  矩阵的列数  这里只循环了自变量for (int i = 0; i < matrix.coloumns - 1; i++) {// Training Vectors being//按照列存数据int[] trainingVector = new int[matrix.Numrows];matrix.fillArray(trainingVector, i);setTrainingVector.put(matrix.Headers.get(i), trainingVector);}// 决策变量int[] FinalClass = new int[matrix.Numrows];matrix.fillArray(FinalClass, matrix.coloumns - 1);// final class vector//初始化TreeNodeTreeNode rootNode = new TreeNode();rootNode.setAtrvalue(-1);// since its a root node 种子节点// Calling the ID3 implementation algorithm 自变量 因变量 生成树  数据learnTree(setTrainingVector, FinalClass, rootNode, matrix);return rootNode;}/** Function :: startLearning, Recursive Function. 
AN exact copy of ID3* algorithm(http://en.wikipedia.org/wiki/ID3_algorithm) This function* generates a decision tree recursively. Parameters: 1.A Hashmap containing* Training Vectors :: HashMap<String,int[]> setTrainingVector 2.A VEctor of* Final class :: int[] FinalClass 3.THe decision tree NOde::TreeNode node* 4.The MAtrix datatype, which is used in constructing vectors of train* data::MatrixData matrix*/public void learnTree(HashMap<String, int[]> setTrainingVector,int[] FinalClass, TreeNode node, MatrixData matrix) {// 判断所有的数据对应的类别是否为单一类别if (checkFinalClass(FinalClass, 0)) {// If all examples are 0, Return// the single-node tree Root,// with label = 0.node.fClass = 0;return;} else if (checkFinalClass(FinalClass, 1)) {// If all examples are 1,node.fClass = 1;return;}// 如果只有一个属性的情况if (setTrainingVector.entrySet().size() == 1) {int cPos = getCountPositives(FinalClass);int cNeg = FinalClass.length - cPos;if (cPos >= cNeg) {node.fClass = 0;return;} else {node.fClass = 1;return;}} else {/*使用信息增益选择属性*/HashMap<String, Double> attributesGains = new HashMap<String, Double>(); // 存储每个特征对应的信息增益HashMap<String, ArrayList<Integer>> mapAttributesValuesInListUnique = new HashMap<String, ArrayList<Integer>>();// The//计算样本数据的熵double entropyS = getEntropy(FinalClass);// initial entropy//setTrainingVector为自变量,每个属性对应的一列值,通过一个一维数组存储for (Map.Entry entry : setTrainingVector.entrySet()) {//某属性不同值对应的正类HashMap<Integer, Integer> atrPositive = new HashMap<Integer, Integer>();//某属性不同值对应的负类HashMap<Integer, Integer> atrNegative = new HashMap<Integer, Integer>();ArrayList<Integer> atrUnique = new ArrayList<Integer>();//获取所有的样本的训练集int[] trainingClass = (int[]) entry.getValue();for (int i = 0; i < trainingClass.length; i++) {// NOw finding// individual// entropiesaddOnlyUnique(atrUnique, trainingClass[i]);if (FinalClass[i] == 0)// its a positive{  //将某一属性下,不同值对应的正类和负类的数量统计出来if (atrPositive.containsKey(trainingClass[i])) {atrPositive.put(trainingClass[i],atrPositive.get(trainingClass[i]) + 1);} 
else {atrPositive.put(trainingClass[i], 1);}} else {// FinalClass is negativeif (atrNegative.containsKey(trainingClass[i])) {atrNegative.put(trainingClass[i],atrNegative.get(trainingClass[i]) + 1);} else {atrNegative.put(trainingClass[i], 1);}}}mapAttributesValuesInListUnique.put((String) entry.getKey(),atrUnique);// 针对每个属性,计算熵{double gain = entropyS;//每个属性对应的不重复值for (int tempAttr : atrUnique) {double entropyTemp = 0.0;int positives = 0;int negatives = 0;//获取不重复值对应的正类if (atrPositive.get(tempAttr) != null)positives = atrPositive.get(tempAttr);//获取不重复值对应的负类if (atrNegative.get(tempAttr) != null)negatives = atrNegative.get(tempAttr);double val1 = (double) (positives)/ (positives + negatives);double val2 = (double) (negatives)/ (positives + negatives);//基于公式计算信息熵entropyTemp = -(val1 * log2(val1))- (val2 * log2(val2));//累计计算信息增益值gain = gain- ((((double) positives + negatives) / trainingClass.length) * entropyTemp);}//封装该属性的信息增益值attributesGains.put((String) entry.getKey(), gain);}}// loop ends/**使用Map排序算法,这里是作者写的,*也可以直接调用  Collections.sort()进行排序**找出使得信息增益最大的属性**/String attributeWithMAxGain = "";double maxGainValue = 0.0;int indexToChoose = 0;for (Map.Entry entry : setTrainingVector.entrySet()) {double tempGain = attributesGains.get((String) entry.getKey());if (indexToChoose == 0) {maxGainValue = tempGain;attributeWithMAxGain = (String) entry.getKey();indexToChoose++;}if (tempGain > maxGainValue) {maxGainValue = tempGain;attributeWithMAxGain = (String) entry.getKey();}}// loop ends//节点添加node.setAttributeName(attributeWithMAxGain);node.setfClass(-1);node.setGain(maxGainValue);//下面,开始采用递归的方式向下计算ArrayList<Integer> atrUniqueValuesForAttrMaxGain = mapAttributesValuesInListUnique.get(attributeWithMAxGain);for (int tempAtrUniqueValue : atrUniqueValuesForAttrMaxGain) {TreeNode NodeChild = new TreeNode();NodeChild.setAtrvalue(tempAtrUniqueValue);// since its a child// nodenode.getBranches().add(NodeChild);MatrixData matrixChild = matrix.splitMatrix(attributeWithMAxGain, 
tempAtrUniqueValue);// matrixChild.printMatrix();// calling the algorithmHashMap<String, int[]> setTrainingVectorChild = new HashMap<String, int[]>();// Now i need a set of R training vectorsfor (int i = 0; i < matrixChild.coloumns - 1; i++) {int[] trainingVectorChild = new int[matrixChild.Numrows];matrixChild.fillArray(trainingVectorChild, i);setTrainingVectorChild.put(matrixChild.Headers.get(i),trainingVectorChild);}// i need final class vectorint[] FinalClassChild = new int[matrixChild.Numrows];matrixChild.fillArray(FinalClassChild, matrixChild.coloumns - 1);learnTree(setTrainingVectorChild, FinalClassChild, NodeChild,matrixChild);}return;}}// Function:checkFinalClass// Returns True or False// If all the attributes in final class equals valueToChecked returns Truepublic boolean checkFinalClass(int[] FinalClass, int valueToChecked) {for (int i = 0; i < FinalClass.length; i++) {if (FinalClass[i] != valueToChecked)return false;}return true;}// Function:getCountPositives// Returns the count of positives in final classpublic int getCountPositives(int[] FinalClass) {int countPos = 0;for (int i = 0; i < FinalClass.length; i++) {if (FinalClass[i] == 0)countPos++;}return countPos;}// 计算样本数据对应的信息熵public double getEntropy(int[] vector) {double entropy = 0.0;int positives = 0;int negatives = 0;for (int i = 0; i < vector.length; i++) {if (vector[i] == 0)// its a positive{positives++;} else {// FinalClass is negativenegatives++;}}double val1 = (double) (positives) / (positives + negatives);double val2 = (double) (negatives) / (positives + negatives);entropy = -(val1 * log2(val1)) - (val2 * log2(val2));return entropy;}// Function:log2// Returns log base 2public static double log2(double num) {if (num <= 0)return 0.0;return (Math.log(num) / Math.log(2));}// Function:addOnlyUnique// Adds a value to the arraylist only if does not exists in the listpublic void addOnlyUnique(ArrayList<Integer> data, int val) {if (!data.contains(val))data.add(val);}}


