#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Modifications 2024 The DAPHNE Consortium.
#
#-------------------------------------------------------------
# This script has been manually translated from Apache SystemDS (https://github.com/apache/systemds).
# Original file: scripts/builtin/decisionTree.dml @ 23177b7779f1868fd51c7af8f98e2ab8221b9011.
# This script implements decision trees for recoded and binned categorical and
# numerical input features. We train a single CART (classification and
# regression tree) decision tree; depending on the provided labels y, it performs
# either classification (majority vote per leaf) or regression (average per leaf).
#
# .. code-block::
#
# For example, given a feature matrix with features [a,b,c,d]
# and the following tree, M would look as follows:
#
# (L1)               |d<5|
#                   /     \
# (L2)           P1:2    |a<7|
#                        /   \
# (L3)                 P2:2 P3:1
#
# --> M :=
# [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
#    |(L1)| |  (L2)  | |        (L3)         |
#
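# Reading the linearized encoding: each node i (numbered level-wise, starting
# at 1 for the root) occupies entries 2*i-1 and 2*i of M. An inner node stores
# (feature index, split value), a leaf node stores (0, label), and nodes that
# do not exist in the tree remain (0, 0).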
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X Feature matrix in recoded/binned representation
# y Label matrix in recoded/binned representation
# ctypes Row-Vector of column types [1 scale/ordinal, 2 categorical]
# of shape 1-by-(ncol(X)+1), where the last entry is the y type
# max_depth Maximum depth of the learned tree (stopping criterion)
# min_leaf Minimum number of samples in leaf nodes (stopping criterion),
# odd number recommended to avoid 50/50 leaf label decisions
# min_split Minimum number of samples in leaf for attempting a split
# max_features Parameter controlling the number of features used as split
# candidates at tree nodes: m = ceil(num_features^max_features)
# max_values Parameter controlling the number of values per feature used
# as split candidates: nb = ceil(num_values^max_values)
# max_dataratio Parameter in [0,1] controlling when to materialize data
# subsets of X and y on node splits. When set to 0, we always
# scan the original X and y, which has the benefit of avoiding
# the allocation and maintenance of data for all active nodes.
# When set to 0.01, we rematerialize whenever the sub-tree data
# would be less than 1% of the last materialized parent data size.
# impurity Impurity measure: entropy, gini (default), rss (regression)
# seed Fixed seed for randomization of samples and split candidates
# verbose Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M Matrix M containing the learned trees, in linearized form
# ------------------------------------------------------------------------------
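#
# Hypothetical example invocation (a sketch only, not part of the original
# script; random data for illustration, real inputs must be recoded/binned as
# described above, and the scalar arguments match the defaults noted in the
# signature of decisionTree below):
#
#   X = round(rand(1000, 4, 1.0, 5.0, 1, 42)); # 4 features with values in {1,...,5}
#   y = round(rand(1000, 1, 1.0, 2.0, 1, 43)); # labels in {1, 2}
#   ctypes = fill(2.0, 1, 5);                  # all columns categorical, incl. y
#   M = decisionTree(X, y, ctypes, 10, 20, 50, 0.5, 1.0, 0.25, "gini", -1, true);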
def computeLeafLabel(y2:matrix<f64>, I:matrix<f64>, classify:bool, verbose:bool) -> f64
{
# TODO The ops in this function seem to get duplicated (e.g., print happens twice).
f = (I @ y2) / sum(I);
# TODO as.f64() should not be necessary.
# TODO <f64> should not be necessary.
# TODO DAPHNE crashes when we use ?: here, thus we rewrite it to if-then-else.
#label = as.scalar<f64>(classify ? as.f64(idxMax(f, 0) + 1) : (f @ seq(1,ncol(f),1)));
# TODO Initializing label before the if-then-else should not be necessary.
label = 0.0;
if(classify)
label = as.f64(idxMax(f, 0)) + 1;
else
label = f @ seq(1,ncol(f),1);
label = as.scalar(label);
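# E.g., for five selected rows with labels [1,1,1,2,2], f = [0.6, 0.4]:
# classification yields label 1 (the majority class), regression yields
# 0.6*1 + 0.4*2 = 1.4.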
if(verbose)
print("-- leaf node label: " + label +" ("+sum(I)*aggMax(f)+"/"+sum(I)+")");
return label;
}
def computeImpurity(y2:matrix<f64>, I:matrix<f64>, impurity:str) -> matrix<f64>
{
f = (I @ y2) / sum(I, 0); # rel. freq. per category/bin
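# E.g., a row of I selecting 4 rows with labels [1,1,2,2] gives f = [0.5, 0.5],
# so gini = 1 - (0.25 + 0.25) = 0.5 and entropy = -(2 * 0.5*log(0.5,2)) = 1.0.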
# TODO The value should be allowed to be `0`, but that results in an error later on.
score = fill(0.0, nrow(I), 1);
if( impurity == "gini" )
score = 1 - sum(f^2, 0); # sum(f*(1-f));
else if( impurity == "entropy" )
# TODO Unary "-".
score = sum((0-f) * log(f, 2), 0);
else if( impurity == "rss" ) { # residual sum of squares
yhat = f @ seq(1,ncol(f),1); # yhat
res = outerSub(yhat, t(idxMax(y2, 0) + 1)); # yhat-y
score = sum((I * res)^2, 0); # sum((yhat-y)^2)
}
else
stop("decisionTree: unsupported impurity measure: "+impurity);
return score;
}
def findBestSplit (X2:matrix<f64>, y2:matrix<f64>, foffb:matrix<f64>, foffe:matrix<f64>,
ID:si64, I:matrix<f64>, min_leaf:si64, max_features:f64, max_values:f64, impurity:str, seed:si64)
-> si64, si64, si64, matrix<f64>, si64, matrix<f64>
{
# sample features iff max_features < 1
n = ncol(foffb);
numI = sum(I);
feat = seq(1,n,1);
if( max_features < 1.0 ) {
# TODO This cannot generate 0.0, as sparsity is set to 1; is that okay?
rI = rand(n, 1, 0.0, 1.0, 1, seed) <= (n^max_features/n);
feat = feat[[rI, ]];
# TODO Problem: feat can have zero rows now -> what would SystemDS do?
if( sum(feat) == 0 ) #sample at least one
# TODO as.si64() should not be necessary.
feat[0,0] = round(rand(1, 1, 1, as.si64(n), 1, -1));
}
# evaluate features and feature splits
# (both categorical and numerical are treated similarly by
# finding a cutoff point in the recoded/binned representation)
# TODO .0 should not be necessary, omitting it requires special instantiation of insertCol.
R = fill(0.0, 3, nrow(feat));
# TODO Support parfor-loops (see #515).
for( i in 1:nrow(feat) ) {
f = as.scalar(feat[i - 1, ]); # feature
beg = as.scalar(foffb[0,f - 1])+1; # feature start in X2
end = as.scalar(foffe[0,f - 1]); # feature end in X2
belen = end-beg; #numFeat - 1
# TODO Do we need this in DAPHNE?
#while(FALSE){} # make beg/end known
# construct 0/1 predicate vectors with <= semantics
# find rows that match at least one value and appear in I
# (vectorized evaluation, each column in P is a split candidate)
# Note: Compared to SystemDS, we use a manual rewrite of the program in DAPHNE.
# In a direct translation, the matrix multiplication `X2 @ P` is the most
# expensive operation in this decision trees implementation. However, it can
# be rewritten as a cumsum along the rows on a column fragment of `X2`. This
# makes the algorithm significantly faster in DAPHNE (on a dense data
# representation). Understanding if this rewrite would also be beneficial in
# SystemDS would require further investigation, especially since SystemDS most
# likely uses sparse data representations at critical points here.
# Consequently, a few lines below are commented out, since they are not
# needed anymore after this manual rewrite. Furthermore, `ncol(fP)` has
# been replaced by `belen`, since `fP` does not exist anymore after that
# manual rewrite.
# fP = upperTri(fill(1,belen,belen), true, true);
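# E.g., for a one-hot fragment row x = [0, 1, 0] (recoded value 2 of 3),
# x @ upperTri(fill(1,3,3), true, true) = [0, 1, 1], which is exactly the
# cumulative sum of x; entry j is 1 iff the recoded value is <= j.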
vI = seq(1,belen,1);
rI2 = fill(1, belen, 1);
sampledFeatVals = false;
if( max_values < 1.0 && belen>10 ) {
# This cannot generate 0.0, as sparsity is set to 1; is that okay?
rI2 = as.si64(rand(belen, 1, 0.0, 1.0, 1, seed) <= (belen^max_values/belen));
# fP = fP[[, rI2]];
vI = vI[[rI2, ]];
sampledFeatVals = true;
}
# Original from SystemDS.
# P = fill(0, ncol(X2), ncol(fP));
# P[beg - 1:end - 1,0:ncol(fP)] = fP;
# Ileft = (t(X2 @ P) * I) != 0;
# Manual rewrite for DAPHNE.
X2_ = X2[, beg - 1:end - 1];
if(sampledFeatVals)
X2_ = X2_[[, rI2]];
Ileft = (cumSum(t(X2_)) * I) != 0;
Iright = (Ileft==0) * I;
# compute information gain for all split candidates
ig = as.scalar(computeImpurity(y2, I, impurity))
- sum(Ileft, 0)/numI * computeImpurity(y2, Ileft, impurity)
- sum(Iright, 0)/numI * computeImpurity(y2, Iright, impurity);
ig = replace(ig, nan, 0);
# track best split value and index, incl validity
valid = (sum(Ileft, 0)>=min_leaf) && (sum(Iright, 0)>=min_leaf);
bestig = aggMax(valid*ig);
# TODO .0 should not be necessary.
bestv = (bestig>0) ? nrow(valid)-as.scalar(idxMax(t(reverse(valid*ig)), 0) + 1)+beg : -1.0;
if( bestv >= 0 )
bestv = as.scalar(vI[bestv - beg,0])+beg - 1;
R[,i - 1] = rbind(rbind(as.matrix(f), as.matrix(bestig)), as.matrix(bestv));
}
ix = as.scalar(idxMax(R[1,], 0) + 1);
# extract indicators and IDs
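# (heap-style numbering: the children of node ID are 2*ID and 2*ID+1)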
IDleft = 2 * ID;
IDright= 2 * ID + 1;
f = as.scalar<si64>(feat[ix - 1,0]);
beg = as.scalar(foffb[0,f - 1]);
v = as.si64(as.scalar(R[2,ix - 1])-beg);
Ileft = [0.0]; # TODO this should not be necessary
Iright = [0.0]; # TODO this should not be necessary
if( aggMax(R[1,]) > 0 ) {
# TODO .0 in fill() shouldn't be necessary, omitting it requires special ctable instantiation.
p = ctable(seq(beg+1, beg+v, 1) - 1, fill(1.0, v, 1) - 1, ncol(X2), 1);
Ileft = (t(X2 @ p) * I) != 0;
Iright = I * (Ileft==0);
}
else { # no information gain
# TODO .0 should not be necessary here.
Ileft = [0.0];
Iright = [0.0];
}
return f, v, IDleft, Ileft, IDright, Iright;
}
# TODO Support optional parameters with defaults (see #548).
def decisionTree(X:matrix<f64>, y:matrix<f64>, ctypes:matrix<f64>,
max_depth:si64 /*= 10*/, min_leaf:si64 /*= 20*/, min_split:si64 /*= 50*/,
max_features:f64 /*= 0.5*/, max_values:f64 /*= 1.0*/, max_dataratio /*= 0.25*/,
impurity:str /*= "gini"*/, seed:si64 /*= -1*/, verbose:bool /*= false*/)
-> matrix<f64>
{
t1 = now();
# validation checks
if( max_depth > 32 )
stop("decisionTree: invalid max_depth > 32: "+max_depth);
if( sum(X<=0) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (values <= 0): "+sum(X<=0));
if( sum(abs(X-round(X))>1e-14) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (non-integer): "+sum(abs(X-round(X))>1e-14));
if( sum(y<=0) != 0 )
stop("decisionTree: label vector y is not properly recoded/binned: "+sum(y<=0));
if(ncol(X) + 1 != ncol(ctypes))
stop("decisionTree: row-vector of column types ctypes must have must have one entry per column of feature matrix X and one additional entry for the label vector y");
# TODO Check if shapes are valid, e.g., if ctypes is a row vector (would also help SystemDS).
# TODO Check if the values in ctypes are all 1 or 2 (would also help SystemDS).
# initialize input data and basic statistics
# (we keep y2 and the indicators I in transposed form for sparsity exploitation)
m = nrow(X); n = ncol(X);
classify = (as.scalar(ctypes[0,n]) == 2);
fdom = max(aggMax(X, 1),2); # num distinct per feature
foffb = t(cumSum(t(fdom))) - fdom; # feature begin
foffe = t(cumSum(t(fdom))); # feature end
# TODO .0 should not be necessary here.
rix = reshape(seq(1.0,m,1)@fill(1,1,n), m*n, 1);
cix = reshape(X + foffb, m*n, 1);
# TODO It's hard for users to understand that the weight must be 1.0 (not 1) to make it a matrix<f64>.
X2 = ctable(rix - 1, cix - 1, 1.0, m, as.scalar(foffe[,n - 1])); #one-hot encoded
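# E.g., X = [1, 2; 2, 1] with fdom = [2, 2] yields the one-hot encoding
# X2 = [1, 0, 0, 1; 0, 1, 1, 0] (columns: f1=1, f1=2, f2=1, f2=2).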
# TODO Cast should not be necessary.
# TODO .0 in seq() should not be necessary, omitting it requires special ctable instantiation.
y2 = as.matrix<f64>(ctable(seq(0.0,m - 1,1), y - 1));
cnt = sum(X2, 1);
# TODO .0 should not be necessary.
I = fill(1.0, 1, nrow(X));
if( verbose ) {
print("decisionTree: initialize with max_depth=" + max_depth + ", max_features="
+ max_features +", max_dataratio=" + max_dataratio + ", impurity="
+ impurity + ", seed=" + seed + ".");
print("decisionTree: basic statistics:");
print("-- impurity: " + as.scalar(computeImpurity(y2, I, impurity)));
print("-- minFeatureCount: " + aggMin(cnt));
print("-- maxFeatureCount: " + aggMax(cnt));
}
# queue-based node splitting
# TODO .0 should not be necessary.
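# (a full binary tree of depth max_depth has 2^max_depth - 1 nodes; M reserves two entries per node)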
M = fill(0.0, 1, 2*(2^max_depth - 1));
# TODO Support for lists at DSL-level (see #660).
# The DML script uses a "list of lists". As that is not supported in DaphneDSL yet,
# we (a) emulate lists by matrices to which we add and remove rows/columns (the "lists" part),
# and (b) split the data structure into its four components (the "of lists" part).
#queue = list(list(1,I,X2,y2)); # node IDs / data indicators
queue_nID = createList([1]);
queue_nI = createList(I);
queue_X2 = createList(X2);
queue_y2 = createList(y2);
# TODO .0 should not be necessary.
maxPath = 1.0;
while( length(queue_nID) > 0 ) {
# pop next node from queue for splitting
queue_nID, nIDmat = remove(queue_nID, 0);
# TODO <si64> should not be necessary here.
nID = as.scalar<si64>(nIDmat);
queue_nI, nI = remove(queue_nI, 0);
queue_X2, X2 = remove(queue_X2, 0);
queue_y2, y2 = remove(queue_y2, 0);
if(verbose)
print("decisionTree: attempting split of node "+nID+" ("+sum(nI)+" rows)");
# optional rematerialization of data per node
if( sum(nI) < max_dataratio*ncol(nI) ) {
if(verbose)
print("-- compacting data: "+ncol(nI)+" --> "+sum(nI));
X2 = X2[[t(nI), ]];
y2 = y2[[t(nI), ]];
nI = fill(1.0, 1, nrow(X2));
}
# find best split attribute
nSeed = (seed==-1) ? seed : seed*nID;
f, v, IDleft, Ileft, IDright, Iright = findBestSplit(
X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
validSplit = sum(Ileft) >= min_leaf && sum(Iright) >= min_leaf;
if(verbose)
print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
if( validSplit ) {
# TODO as.f64() should not be necessary, omitting it requires special instantiation of insertCol.
M[, 2*nID - 2:2*nID] = as.f64(t(rbind(as.matrix(f), as.matrix(v))));
}
else {
# TODO as.bool() should not be necessary, should be casted automatically
# TODO as.matrix() should not be necessary.
M[, 2*nID - 1] = as.matrix(computeLeafLabel(y2, nI, as.bool(classify), verbose));
}
maxPath = max(maxPath, floor(log(nID,2)+1));
# split data, finalize or recurse
if( validSplit ) {
if( sum(Ileft) >= min_split && floor(log(IDleft,2))+2 < max_depth ) {
queue_nID = append(queue_nID, as.matrix(IDleft));
queue_nI = append(queue_nI, Ileft);
queue_X2 = append(queue_X2, X2);
queue_y2 = append(queue_y2, y2);
}
else {
# TODO as.bool() should not be necessary, should be casted automatically (see #661).
# TODO as.matrix() should not be necessary.
M[,2*IDleft - 1] = as.matrix(computeLeafLabel(y2, Ileft, as.bool(classify), verbose));
}
if( sum(Iright) >= min_split && floor(log(IDright,2))+2 < max_depth ) {
queue_nID = append(queue_nID, as.matrix(IDright));
queue_nI = append(queue_nI, Iright);
queue_X2 = append(queue_X2, X2);
queue_y2 = append(queue_y2, y2);
}
else {
# TODO as.bool() should not be necessary, should be casted automatically (see #661).
# TODO as.matrix() should not be necessary.
M[,2*IDright - 1] = as.matrix(computeLeafLabel(y2, Iright, as.bool(classify), verbose));
}
maxPath = max(maxPath, floor(log(IDleft,2)+1));
}
}
# summary and encoding
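# (trim M to the deepest tree level that was actually constructed)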
M = M[0, 0:2*(2^maxPath - 1)];
if(verbose) {
print("decisionTree: final constructed tree (linearized):");
print("--", false);
print(M);
}
return M;
}