Example 3: RandomTrees

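This example builds a random forest of C45 decision trees on simulated data with a categorical response (column 0) and three predictors, two categorical and one continuous. In the output below, the variable importance measure based on the change in predictive accuracy is highest (least negative, -0.002) for the entry at index 1 of the importance vector, not for variable 0.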


import com.imsl.math.*;
import com.imsl.stat.*;
import com.imsl.datamining.decisionTree.*;

/**
 * Fits a random forest of C45 decision trees to a small simulated data set
 * and prints the out-of-bag class errors and the variable importance vector.
 */
public class RandomTreesEx3 {

    /** Index of the column in {@link #TRAINING_DATA} that holds the response. */
    private static final int RESPONSE_COLUMN = 0;

    /** Simulated observations, one row per case; each row has four columns. */
    private static final double[][] TRAINING_DATA = {
        {2, 25.92869, 0, 0}, {1, 51.63245, 1, 1},
        {1, 25.78432, 0, 2}, {0, 39.37948, 0, 3},
        {2, 24.65058, 0, 2}, {2, 45.20084, 0, 2},
        {2, 52.67960, 1, 3}, {1, 44.28342, 1, 3},
        {2, 40.63523, 1, 3}, {2, 51.76094, 0, 3},
        {2, 26.30368, 0, 1}, {2, 20.70230, 1, 0},
        {2, 38.74273, 1, 3}, {2, 19.47333, 0, 0},
        {1, 26.42211, 0, 0}, {2, 37.05986, 1, 0},
        {1, 51.67043, 1, 3}, {0, 42.40156, 0, 3},
        {2, 33.90027, 1, 2}, {1, 35.43282, 0, 0},
        {1, 44.30369, 0, 1}, {0, 46.72387, 0, 2},
        {1, 46.99262, 0, 2}, {0, 36.05923, 0, 3},
        {2, 36.83197, 1, 1}, {1, 61.66257, 1, 2},
        {0, 25.67714, 0, 3}, {1, 39.08567, 1, 0},
        {0, 48.84341, 1, 1}, {1, 39.34391, 0, 3},
        {2, 24.73522, 0, 2}, {1, 50.55251, 1, 3},
        {0, 31.34263, 1, 3}, {1, 27.15795, 1, 0},
        {0, 31.72685, 0, 2}, {0, 25.00408, 0, 3},
        {1, 26.35457, 1, 3}, {2, 38.12343, 0, 1},
        {0, 49.94030, 0, 2}, {1, 42.45779, 1, 3},
        {0, 38.80948, 1, 1}, {0, 43.22799, 1, 1},
        {0, 41.87624, 0, 3}, {2, 48.07820, 0, 2},
        {0, 43.23673, 1, 0}, {2, 39.41294, 0, 3},
        {1, 23.93346, 0, 2}, {2, 42.84130, 1, 3},
        {2, 30.40669, 0, 1}, {0, 37.77389, 0, 2}
    };

    /** Declared type of each column of {@link #TRAINING_DATA}, in column order. */
    private static final DecisionTree.VariableType[] COLUMN_TYPES = {
        DecisionTree.VariableType.CATEGORICAL,
        DecisionTree.VariableType.QUANTITATIVE_CONTINUOUS,
        DecisionTree.VariableType.CATEGORICAL,
        DecisionTree.VariableType.CATEGORICAL
    };

    /**
     * Builds the forest, then prints its class errors and variable importance.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if any of the library calls fails
     */
    public static void main(String[] args) throws Exception {
        // Observed responses, needed later to score the out-of-bag predictions.
        double[] observedResponse = copyColumn(TRAINING_DATA, RESPONSE_COLUMN);

        C45 baseTree = new C45(TRAINING_DATA, RESPONSE_COLUMN, COLUMN_TYPES);
        RandomTrees forest = new RandomTrees(baseTree);
        // Fixed seed, presumably so the example output is reproducible.
        forest.setRandomObject(new Random(123457));
        forest.setCalculateVariableImportance(true);
        forest.fitModel();

        double[] oobPredictions = forest.getOutOfBagPredictions();
        int[][] errorTable = forest.getClassErrors(observedResponse, oobPredictions);
        double[] importance = forest.getVariableImportance();

        PrintMatrix errorPrinter =
                new PrintMatrix("C45 Random Forest class errors:");
        errorPrinter.print(errorTable);

        PrintMatrix importancePrinter =
                new PrintMatrix("C45 Random Forest variable importance:");
        importancePrinter.print(importance);
    }

    /** Returns a new array holding the values of one column of {@code rows}. */
    private static double[] copyColumn(double[][] rows, int column) {
        double[] values = new double[rows.length];
        int next = 0;
        for (double[] row : rows) {
            values[next++] = row[column];
        }
        return values;
    }
}

Output

C45 Random Forest class errors:
   0   1   
0  13  15  
1  16  16  
2  13  19  
3  42  50  

C45 Random Forest variable importance:
     0     
0  -0.018  
1  -0.002  
2  -0.007  

Link to Java source.