Skip to content

Commit af225ad

Browse files
authored
fix(RF): performance of feature sampling for node splits (#2292)
* disable bottlenecks caused by memorySavingMode=false
* feat: add drawSample function with O(k) runtime
* use drawKFromBufferWithoutReplacement in decision forest training
* sample directly in findBestSplit function to make it O(_maxFeatures)
* rename service_memset_sequential -> service_memset_incrementing
* feat: status checks for node splitting algorithms
1 parent 5e080cf commit af225ad

File tree

4 files changed

+169
-63
lines changed

4 files changed

+169
-63
lines changed

cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i

+126-61
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ private:
115115
WorkItem * _data; // array of heap elements, max element is on the left
116116
};
117117

118+
//////////////////////////////////////////////////////////////////////////////////////////
119+
// Service structure, node split error & splitting status
120+
//////////////////////////////////////////////////////////////////////////////////////////
121+
struct NodeSplitResult
122+
{
123+
services::Status status;
124+
bool bSplitSucceeded;
125+
};
126+
118127
//////////////////////////////////////////////////////////////////////////////////////////
119128
// Service structure, contains numeric tables to be calculated as result
120129
//////////////////////////////////////////////////////////////////////////////////////////
@@ -595,14 +604,14 @@ protected:
595604
algorithmFPType imp);
596605
typename DataHelper::NodeType::Leaf * makeLeaf(const IndexType * idx, size_t n, typename DataHelper::ImpurityData & imp, size_t makeLeaf);
597606

598-
bool findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
599-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
600-
bool findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
601-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
602-
bool findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
603-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
604-
bool simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
605-
typename DataHelper::TSplitData & split);
607+
NodeSplitResult findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
608+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
609+
NodeSplitResult findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
610+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
611+
NodeSplitResult findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
612+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
613+
NodeSplitResult simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
614+
typename DataHelper::TSplitData & split);
606615
void addImpurityDecrease(IndexType iFeature, size_t n, const typename DataHelper::ImpurityData & curImpurity,
607616
const typename DataHelper::TSplitData & split);
608617

@@ -619,7 +628,7 @@ protected:
619628
const size_t nGen = (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures) ? n : _nFeaturesPerNode;
620629
*_numElems += n;
621630
RNGs<IndexType, cpu> rng;
622-
rng.uniformWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), 0, n);
631+
rng.drawKFromBufferWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), n);
623632
}
624633

625634
services::Status computeResults(const dtrees::internal::Tree & t);
@@ -683,16 +692,18 @@ services::Status TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, c
683692
_aFeatureBuf.reset(_nFeatureBufs);
684693
_aFeatureIndexBuf.reset(_nFeatureBufs);
685694

686-
if (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures)
687-
{
688-
_aFeatureIdx.reset(maxFeatures * 2); // maxFeatures elements are used by algorithm, others are used internally by generator
689-
_aConstFeatureIdx.reset(maxFeatures * 2); // first maxFeatures elements are used for saving indices of constant features,
690-
// the other part are used for saving levels of this features
691-
DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
692-
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), maxFeatures * 2);
693-
}
694-
else
695-
_aFeatureIdx.reset(_nFeaturesPerNode * 2); // _nFeaturesPerNode elements are used by algorithm, others are used internally by generator
695+
/* first maxFeatures entries serve as a buffer of drawn samples for node splitting */
696+
/* second maxFeatures entries contain [0, ..., maxFeatures - 1] and are used to randomly draw indices */
697+
_aFeatureIdx.reset(maxFeatures * 2);
698+
_aConstFeatureIdx.reset(maxFeatures * 2);
699+
700+
DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
701+
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), 2 * maxFeatures);
702+
// in order to use drawKFromBufferWithoutReplacement we need to initialize
703+
// the buffer to contain all indices from [0, 1, ..., maxFeatures - 1]
704+
DAAL_CHECK_MALLOC(_aFeatureIdx.get());
705+
services::internal::service_memset_seq<IndexType, cpu>(_aFeatureIdx.get(), IndexType(0), maxFeatures);
706+
services::internal::service_memset_incrementing<IndexType, cpu>(_aFeatureIdx.get() + maxFeatures, IndexType(0), maxFeatures);
696707

697708
DAAL_CHECK_MALLOC(_aSample.get() && _helper.reset(_nSamples) && _helper.resetWeights(_nSamples) && _aFeatureBuf.get() && _aFeatureIndexBuf.get()
698709
&& _aFeatureIdx.get());
@@ -798,7 +809,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
798809

799810
typename DataHelper::TSplitData split;
800811
IndexType iFeature;
801-
if (findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights))
812+
813+
NodeSplitResult split_result = findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights);
814+
DAAL_ASSERT(split_result.status.ok());
815+
if (split_result.bSplitSucceeded)
802816
{
803817
const size_t nLeft = split.nLeft;
804818
const double imp = curImpurity.var;
@@ -844,6 +858,7 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
844858
DAAL_ASSERT(split.nLeft == right->count);
845859
return res;
846860
}
861+
847862
return makeLeaf(_aSample.get() + iStart, n, curImpurity, nClasses);
848863
}
849864

@@ -859,7 +874,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
859874
{
860875
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
861876
}
862-
else if (findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights))
877+
878+
NodeSplitResult split_result = findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights);
879+
DAAL_ASSERT(split_result.status.ok());
880+
if (split_result.bSplitSucceeded)
863881
{
864882
const double imp = impurity.var;
865883
const double impLeft = split.left.var;
@@ -896,10 +914,8 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
896914
return item.node;
897915
}
898916
}
899-
else
900-
{
901-
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
902-
}
917+
918+
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
903919
}
904920

905921
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
@@ -1032,37 +1048,40 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
10321048
}
10331049

10341050
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1035-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
1036-
const typename DataHelper::ImpurityData & curImpurity,
1037-
IndexType & iFeatureBest,
1038-
typename DataHelper::TSplitData & split)
1051+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
1052+
const typename DataHelper::ImpurityData & curImpurity,
1053+
IndexType & iFeatureBest,
1054+
typename DataHelper::TSplitData & split)
10391055
{
1056+
services::Status st;
10401057
RNGs<IndexType, cpu> rng;
10411058
algorithmFPType featBuf[2];
10421059
IndexType * aIdx = _aSample.get() + iStart;
10431060
for (size_t i = 0; i < _nFeaturesPerNode; ++i)
10441061
{
10451062
IndexType iFeature;
10461063
*_numElems += 1;
1047-
rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
1064+
int errorcode = rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
1065+
if (errorcode)
1066+
{
1067+
st = services::Status(services::ErrorNullResult);
1068+
}
10481069
featureValuesToBuf(iFeature, featBuf, aIdx, 2);
10491070
if (featBuf[1] - featBuf[0] <= _accuracy) //all values of the feature are the same
10501071
continue;
10511072
_helper.simpleSplit(featBuf, aIdx, split);
10521073
split.featureUnordered = _featHelper.isUnordered(iFeature);
10531074
split.impurityDecrease = curImpurity.var;
10541075
iFeatureBest = iFeature;
1055-
return true;
1076+
return { st, true };
10561077
}
1057-
return false;
1078+
return { st, false };
10581079
}
10591080

10601081
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1061-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(size_t level, size_t iStart, size_t n,
1062-
const typename DataHelper::ImpurityData & curImpurity,
1063-
IndexType & iFeatureBest,
1064-
typename DataHelper::TSplitData & split,
1065-
algorithmFPType totalWeights)
1082+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(
1083+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
1084+
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
10661085
{
10671086
if (n == 2)
10681087
{
@@ -1078,26 +1097,67 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
10781097

10791098
//find best split and put it to featureIndexBuf
10801099
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1081-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(size_t level, size_t iStart, size_t n,
1082-
const typename DataHelper::ImpurityData & curImpurity,
1083-
IndexType & iBestFeature,
1084-
typename DataHelper::TSplitData & bestSplit,
1085-
algorithmFPType totalWeights)
1100+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(
1101+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
1102+
typename DataHelper::TSplitData & bestSplit, algorithmFPType totalWeights)
10861103
{
1087-
chooseFeatures();
1088-
size_t nVisitedFeature = 0;
1089-
const size_t maxFeatures = nFeatures();
1090-
const float qMax = 0.02; //min fracture of observations to be handled as indexed feature values
1091-
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
1092-
IndexType * aIdx = _aSample.get() + iStart;
1093-
int iBestSplit = -1;
1094-
int idxFeatureValueBestSplit = -1; //when sorted feature is used
1104+
services::Status st;
1105+
1106+
/* counter of the number of visited features, we visit _nFeaturesPerNode
1107+
* depending on _useConstFeatures, constant features can be skipped
1108+
*/
1109+
size_t nVisitedFeature = 0;
1110+
/* total number of features */
1111+
const size_t maxFeatures = nFeatures();
1112+
/* minimum fraction of all samples per bin */
1113+
const algorithmFPType qMax = 0.02;
1114+
/* index of the best split, initialized to first index we investigate */
1115+
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
1116+
/* sample index */
1117+
IndexType * aIdx = _aSample.get() + iStart;
1118+
/* zero-based index of best split */
1119+
int64_t iBestSplit = -1;
1120+
int64_t idxFeatureValueBestSplit = -1;
10951121
typename DataHelper::TSplitData split;
1096-
const float fact = float(n);
1122+
/* RNG for sample drawing */
1123+
RNGs<IndexType, cpu> rng;
1124+
/* index for swapping samples in Fisher-Yates sampling */
1125+
IndexType swapIdx;
1126+
10971127
for (size_t i = 0; i < maxFeatures && nVisitedFeature < _nFeaturesPerNode; ++i)
10981128
{
1099-
const auto iFeature = _aFeatureIdx[i];
1100-
const bool bUseIndexedFeatures = (!_par.memorySavingMode) && (fact > qMax * float(_helper.indexedFeatures().numIndices(iFeature)));
1129+
/* draw a random sample without replacement */
1130+
// based on Fisher Yates sampling
1131+
// _aFeatureIdx has length of 2 * _maxFeatures
1132+
// first maxFeatures contain the currently selected features
1133+
// at iteration i, we have drawn i features and written them to
1134+
// _aFeatureIdx[0, 1, ..., i-1]
1135+
//
1136+
// the second half of the buffer contains all numbers from
1137+
// [0, 1, ..., maxFeatures-1] and we randomly select one without
1138+
// replacement based on Fisher Yates sampling
1139+
// drawing uniformly from [0, maxFeatures-i] and swapping the indices
1140+
// assures uniform probability of all drawn numbers
1141+
1142+
/* draw the i-th index of the sample */
1143+
int errorcode = rng.uniform(1, &swapIdx, _engineImpl->getState(), 0, maxFeatures - i);
1144+
if (errorcode)
1145+
{
1146+
st = services::Status(services::ErrorNullResult);
1147+
}
1148+
1149+
/* account for buffer offset from 0 */
1150+
swapIdx += maxFeatures;
1151+
/* _aFeatureIdx[swapIdx] was drawn */
1152+
_aFeatureIdx[i] = _aFeatureIdx[swapIdx];
1153+
/* swap in number at [2 * maxFeatures - 1 - i] for next draw */
1154+
_aFeatureIdx[swapIdx] = _aFeatureIdx[2 * maxFeatures - 1 - i];
1155+
/* store drawn number at end of number buffer so that no number is lost */
1156+
_aFeatureIdx[2 * maxFeatures - 1 - i] = _aFeatureIdx[i];
1157+
1158+
const auto iFeature = _aFeatureIdx[i];
1159+
const bool bUseIndexedFeatures =
1160+
(!_par.memorySavingMode) && (algorithmFPType(n) > qMax * algorithmFPType(_helper.indexedFeatures().numIndices(iFeature)));
11011161

11021162
if (!_maxLeafNodes && !_useConstFeatures && !_par.memorySavingMode)
11031163
{
@@ -1154,7 +1214,14 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
11541214
#endif
11551215
}
11561216
}
1157-
if (iBestSplit < 0) return false; //not found
1217+
1218+
if (!st.ok() || iBestSplit < 0)
1219+
{
1220+
// either:
1221+
// error during splitting -> failure
1222+
// or no split found -> not a failure but still have to return
1223+
return { st, false };
1224+
}
11581225

11591226
iBestFeature = _aFeatureIdx[iBestSplit];
11601227
bool bCopyToIdx = true;
@@ -1193,20 +1260,18 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
11931260
bCopyToIdx = (iBestSplit + 1 < _nFeaturesPerNode); //if iBestSplit is the last considered feature
11941261
//then aIdx already contains the best split, no need to copy
11951262
if (bCopyToIdx) services::internal::tmemcpy<IndexType, cpu>(aIdx, bestSplitIdx, n);
1196-
return true;
1263+
return { st, true };
11971264
}
11981265

11991266
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1200-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(size_t level, size_t iStart, size_t n,
1201-
const typename DataHelper::ImpurityData & curImpurity,
1202-
IndexType & iFeatureBest,
1203-
typename DataHelper::TSplitData & split,
1204-
algorithmFPType totalWeights)
1267+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(
1268+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
1269+
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
12051270
{
12061271
chooseFeatures();
12071272
TArray<typename DataHelper::TSplitData, cpu> aFeatureSplit(_nFeaturesPerNode);
12081273
//TODO, if parallel for features
1209-
return false;
1274+
return { services::Status(services::ErrorMethodNotSupported), false };
12101275
}
12111276

12121277
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>

cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ Parameter::Parameter()
5656
minWeightFractionInLeafNode(0.),
5757
minImpurityDecreaseInSplitNode(0.),
5858
maxLeafNodes(0),
59-
minBinSize(5),
60-
maxBins(256)
59+
maxBins(256),
60+
minBinSize(5)
6161
{}
6262
} // namespace interface2
6363
Status checkImpl(const decision_forest::training::interface2::Parameter & prm)

cpp/daal/src/externals/service_memory.h

+13
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ T * service_memset(T * const ptr, const T value, const size_t num)
127127
return ptr;
128128
}
129129

130+
/* Initialize a block of memory of length num with the given value */
130131
template <typename T, CpuType cpu>
131132
void service_memset_seq(T * const ptr, const T value, const size_t num)
132133
{
@@ -138,6 +139,18 @@ void service_memset_seq(T * const ptr, const T value, const size_t num)
138139
}
139140
}
140141

142+
/* Initialize a block of memory of length num with the entries [startValue, ..., startValue + num - 1] */
143+
template <typename T, CpuType cpu>
144+
void service_memset_incrementing(T * const ptr, const T startValue, const size_t num)
145+
{
146+
PRAGMA_IVDEP
147+
PRAGMA_VECTOR_ALWAYS
148+
for (size_t i = 0; i < num; i++)
149+
{
150+
ptr[i] = startValue + i;
151+
}
152+
}
153+
141154
} // namespace internal
142155
} // namespace services
143156
} // namespace daal

0 commit comments

Comments
 (0)