randomForest_.daph
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Modifications 2024 The DAPHNE Consortium.
#
#-------------------------------------------------------------
# This script has been manually translated from Apache SystemDS (https://github.com/apache/systemds).
# Original file: scripts/builtin/randomForest.dml @ afe7077ee1fa91d47aee6c985125f56ea80f8f32.
# This script implements random forest for recoded and binned categorical and
# numerical input features. In detail, we train multiple CART (classification
# and regression trees) decision trees in parallel and use them as an ensemble
# classifier/regressor. Each tree is trained on a sample of observations (rows)
# and optionally a subset of features (columns). During tree construction, split
# candidates are additionally chosen from a sample of the remaining features.
#
# .. code-block::
#
#   For example, given a feature matrix with features [a,b,c,d]
#   and the following two trees, M (the output) would look as follows:
#
#   (L1)         |a<7|                  |d<5|
#               /      \               /      \
#   (L2)     |c<3|     |b<4|       |a<7|      P3:2
#           /     \    /    \      /    \
#   (L3)   P1:2  P2:1 P3:1  P4:2  P1:2  P2:1
#   --> M :=
#   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
#    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
#    |(L1)|  |  (L2)  |  |        (L3)        |
#
#   Each node occupies two entries: (feature index, split value) for inner
#   nodes, (0, predicted label) for leaves, and (0, 0) padding for absent
#   nodes.
#
#   With feature sampling (feature_frac < 1), each tree is
#   prefixed by a one-hot vector of sampled features
#   (e.g., [1,1,1,0] if we sampled a, b, and c of the four features).
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X             Feature matrix in recoded/binned representation
# y             Label matrix in recoded/binned representation
# ctypes        Row-vector of column types [1 scale/ordinal, 2 categorical]
#               of shape 1-by-(ncol(X)+1), where the last entry is the y type
# num_trees     Number of trees to be learned in the random forest model
# sample_frac   Sample fraction of examples for each tree in the forest
# feature_frac  Sample fraction of features for each tree in the forest
# max_depth     Maximum depth of the learned trees (stopping criterion)
# min_leaf      Minimum number of samples in leaf nodes (stopping criterion)
# min_split     Minimum number of samples in a leaf for attempting a split
# max_features  Parameter controlling the number of features used as split
#               candidates at tree nodes: m = ceil(num_features^max_features)
# max_values    Parameter controlling the number of values per feature used
#               as split candidates: nb = ceil(num_values^max_values)
#               (see the worked example after this list)
# impurity      Impurity measure: entropy, gini (default), rss (regression)
# seed          Fixed seed for randomization of samples and split candidates
# verbose       Flag indicating verbose debug output
# ------------------------------------------------------------------------------
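#
# For instance (illustrative numbers): with 64 candidate features and
# max_features = 0.5, a tree node draws m = ceil(64^0.5) = 8 features as
# split candidates; with 100 distinct values per feature and max_values = 1.0,
# all nb = ceil(100^1.0) = 100 values are evaluated as split points.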
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M             Matrix containing the learned trees, in linearized form
# ------------------------------------------------------------------------------
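#
# A minimal usage sketch from a separate script (hypothetical data and
# parameter values; assumes X and y are recoded/binned to contiguous
# positive integers, as required above):
#
#   import "randomForest_.daph";
#   X = ceil(rand(100, 4, 0.0001, 5.0, 1, 42)); # 100x4 features in {1,...,5}
#   y = ceil(rand(100, 1, 0.0001, 2.0, 1, 43)); # labels in {1, 2}
#   # scale/ordinal features, categorical label:
#   ctypes = cbind(fill(1.0, 1, ncol(X)), fill(2.0, 1, 1));
#   M = randomForest_.randomForest(X, y, ctypes,
#       /*num_trees=*/4, /*sample_frac=*/0.8, /*feature_frac=*/1.0,
#       /*max_depth=*/5, /*min_leaf=*/4, /*min_split=*/8,
#       /*max_features=*/0.5, /*max_values=*/1.0,
#       /*impurity=*/"gini", /*seed=*/7, /*verbose=*/true);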
import "decisionTree_.daph";
# TODO Support optional parameters with defaults (see #548).
def randomForest(X:matrix<f64>, y:matrix<f64>, ctypes:matrix<f64>,
    num_trees:si64 /*= 16*/, sample_frac:f64 /*= 0.1*/, feature_frac:f64 /*= 1.0*/,
    max_depth:si64 /*= 10*/, min_leaf:si64 /*= 20*/, min_split:si64 /*= 50*/,
    max_features:f64 /*= 0.5*/, max_values:f64 /*= 1.0*/,
    impurity:str /*= "gini"*/, seed:si64 /*= -1*/, verbose:bool /*= false*/) -> matrix<f64>
{
    t1 = now();
    # validation and initialization of reproducible seeds
    if(verbose) {
        print("randomForest: initialize with num_trees=" + num_trees + ", sample_frac=" + sample_frac
            + ", feature_frac=" + feature_frac + ", impurity=" + impurity + ", seed=" + seed + ".");
    }
    if(ncol(ctypes) != ncol(X)+1)
        stop("randomForest: inconsistent num features (incl. label) and col types: "+(ncol(X)+1)+" vs "+ncol(ctypes)+".");
    if( sum(X<=0) != 0 )
        stop("randomForest: feature matrix X is not properly recoded/binned: "+sum(X<=0)+" entries <= 0.");
    if(sum(y <= 0) != 0)
        stop("randomForest: y is not properly recoded and binned (contiguous positive integers).");
    if(aggMax(y) == 1)
        stop("randomForest: y contains only one class label.");
    lseed = as.scalar<si64>(seed!=-1 ? seed : as.scalar(rand(1,1,0,as.si64(1e9),1,-1)));
    randSeeds = rand(3 * num_trees, 1, 0, as.si64(1e9), 1, lseed);
    # training of num_trees decision trees
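    # M: one row per tree; a full binary tree of max_depth levels has
    # 2^max_depth - 1 nodes, each encoded as two values. F: one-hot indicators
    # of the features sampled per tree (all ones without feature sampling).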
    M = fill(0.0, num_trees, 2*(2^max_depth - 1));
    F = fill(1.0, num_trees, ncol(X));
    # TODO Support parfor-loops (see #515).
    for(i in 1:num_trees) {
        if( verbose )
            print("randomForest: start training tree "+i+"/"+num_trees+".");
        # step 1: sample data
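        # (Bernoulli sampling: keep each row with probability sample_frac;
        # at least two rows are forced to keep the training well-defined.)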
        Xi = X; yi = y;
        if( sample_frac < 1.0 ) {
            si1 = as.si64(as.scalar(randSeeds[3*(i - 1),0]));
            I1 = rand(nrow(X), 1, 0.0, 1.0, 1, si1) <= sample_frac;
            if( sum(I1) <= 1 ) # min 2 tuples
                # TODO .0 should not be necessary.
                I1[0:2,] = fill(1.0,2,1);
            Xi = X[[I1,]];
            yi = y[[I1,]];
        }
        # step 2: sample features
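        # (Bernoulli sampling of columns; the mask I2 is recorded in F so the
        # tree's features can be mapped back to the original columns.)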
        if( feature_frac < 1.0 ) {
            si2 = as.si64(as.scalar(randSeeds[3*(i - 1)+1,0]));
            I2 = rand(ncol(X), 1, 0.0, 1.0, 1, si2) <= feature_frac;
            Xi = Xi[[, I2]];
            F[i - 1,] = t(I2);
        }
        if( verbose )
            print("-- ["+i+"] sampled "+nrow(Xi)+"/"+nrow(X)+" rows and "+ncol(Xi)+"/"+ncol(X)+" cols.");
        # step 3: train decision tree
        t2 = now();
        si3 = as.si64(as.scalar(randSeeds[3*(i - 1)+2,0]));
        Mtemp = decisionTree_.decisionTree(Xi, yi, ctypes, max_depth, min_leaf, min_split,
            max_features, max_values, /*max_dataratio=*/0.25,
            impurity, si3, verbose);
        M[i - 1, 0:ncol(Mtemp)] = reshape(Mtemp, 1, ncol(Mtemp));
        if( verbose )
            print("-- ["+i+"] trained decision tree in "+(as.f64(now()-t2)/1e9)+" seconds.");
    }
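    # Prepend the per-tree feature indicators, yielding rows [F | linearized tree].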
    M = cbind(F, M);
    if(verbose) {
        print("randomForest: trained ensemble with num_trees="+num_trees+" in "+(as.f64(now()-t1)/1e9)+" seconds.");
    }
    return M;
}