Skip to content

Commit af225ad

Browse files
authored
fix(RF): performance of feature sampling for node splits (#2292)
* disable bottlenecks caused by memorySavingMode=false
* feat: add drawSample function with O(k) runtime
* use drawKFromBufferWithoutReplacement in decision forest training
* sample directly in findBestSplit function to make it O(_maxFeatures)
* rename service_memset_sequential -> service_memset_incrementing
* feat: status checks for node splitting algorithms
1 parent 5e080cf commit af225ad

File tree

4 files changed

+169
-63
lines changed

4 files changed

+169
-63
lines changed

cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i

+126-61
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ private:
115115
WorkItem * _data; // array of heap elements, max element is on the left
116116
};
117117

118+
//////////////////////////////////////////////////////////////////////////////////////////
119+
// Service structure, node split error & splitting status
120+
//////////////////////////////////////////////////////////////////////////////////////////
121+
struct NodeSplitResult
122+
{
123+
services::Status status;
124+
bool bSplitSucceeded;
125+
};
126+
118127
//////////////////////////////////////////////////////////////////////////////////////////
119128
// Service structure, contains numeric tables to be calculated as result
120129
//////////////////////////////////////////////////////////////////////////////////////////
@@ -595,14 +604,14 @@ protected:
595604
algorithmFPType imp);
596605
typename DataHelper::NodeType::Leaf * makeLeaf(const IndexType * idx, size_t n, typename DataHelper::ImpurityData & imp, size_t makeLeaf);
597606

598-
bool findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
599-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
600-
bool findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
601-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
602-
bool findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
603-
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
604-
bool simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
605-
typename DataHelper::TSplitData & split);
607+
NodeSplitResult findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
608+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
609+
NodeSplitResult findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
610+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
611+
NodeSplitResult findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
612+
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
613+
NodeSplitResult simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
614+
typename DataHelper::TSplitData & split);
606615
void addImpurityDecrease(IndexType iFeature, size_t n, const typename DataHelper::ImpurityData & curImpurity,
607616
const typename DataHelper::TSplitData & split);
608617

@@ -619,7 +628,7 @@ protected:
619628
const size_t nGen = (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures) ? n : _nFeaturesPerNode;
620629
*_numElems += n;
621630
RNGs<IndexType, cpu> rng;
622-
rng.uniformWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), 0, n);
631+
rng.drawKFromBufferWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), n);
623632
}
624633

625634
services::Status computeResults(const dtrees::internal::Tree & t);
@@ -683,16 +692,18 @@ services::Status TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, c
683692
_aFeatureBuf.reset(_nFeatureBufs);
684693
_aFeatureIndexBuf.reset(_nFeatureBufs);
685694

686-
if (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures)
687-
{
688-
_aFeatureIdx.reset(maxFeatures * 2); // maxFeatures elements are used by algorithm, others are used internally by generator
689-
_aConstFeatureIdx.reset(maxFeatures * 2); // first maxFeatures elements are used for saving indices of constant features,
690-
// the other part are used for saving levels of this features
691-
DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
692-
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), maxFeatures * 2);
693-
}
694-
else
695-
_aFeatureIdx.reset(_nFeaturesPerNode * 2); // _nFeaturesPerNode elements are used by algorithm, others are used internally by generator
695+
/* first maxFeatures entries serve as a buffer of drawn samples for node splitting */
696+
/* second maxFeatures entries contain [0, ..., maxFeatures - 1] and are used to randomly draw indices */
697+
_aFeatureIdx.reset(maxFeatures * 2);
698+
_aConstFeatureIdx.reset(maxFeatures * 2);
699+
700+
DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
701+
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), 2 * maxFeatures);
702+
// in order to use drawKFromBufferWithoutReplacement we need to initialize
703+
// the buffer to contain all indices from [0, 1, ..., maxFeatures - 1]
704+
DAAL_CHECK_MALLOC(_aFeatureIdx.get());
705+
services::internal::service_memset_seq<IndexType, cpu>(_aFeatureIdx.get(), IndexType(0), maxFeatures);
706+
services::internal::service_memset_incrementing<IndexType, cpu>(_aFeatureIdx.get() + maxFeatures, IndexType(0), maxFeatures);
696707

697708
DAAL_CHECK_MALLOC(_aSample.get() && _helper.reset(_nSamples) && _helper.resetWeights(_nSamples) && _aFeatureBuf.get() && _aFeatureIndexBuf.get()
698709
&& _aFeatureIdx.get());
@@ -798,7 +809,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
798809

799810
typename DataHelper::TSplitData split;
800811
IndexType iFeature;
801-
if (findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights))
812+
813+
NodeSplitResult split_result = findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights);
814+
DAAL_ASSERT(split_result.status.ok());
815+
if (split_result.bSplitSucceeded)
802816
{
803817
const size_t nLeft = split.nLeft;
804818
const double imp = curImpurity.var;
@@ -844,6 +858,7 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
844858
DAAL_ASSERT(split.nLeft == right->count);
845859
return res;
846860
}
861+
847862
return makeLeaf(_aSample.get() + iStart, n, curImpurity, nClasses);
848863
}
849864

@@ -859,7 +874,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
859874
{
860875
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
861876
}
862-
else if (findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights))
877+
878+
NodeSplitResult split_result = findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights);
879+
DAAL_ASSERT(split_result.status.ok());
880+
if (split_result.bSplitSucceeded)
863881
{
864882
const double imp = impurity.var;
865883
const double impLeft = split.left.var;
@@ -896,10 +914,8 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
896914
return item.node;
897915
}
898916
}
899-
else
900-
{
901-
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
902-
}
917+
918+
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
903919
}
904920

905921
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
@@ -1032,37 +1048,40 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
10321048
}
10331049

10341050
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1035-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
1036-
const typename DataHelper::ImpurityData & curImpurity,
1037-
IndexType & iFeatureBest,
1038-
typename DataHelper::TSplitData & split)
1051+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
1052+
const typename DataHelper::ImpurityData & curImpurity,
1053+
IndexType & iFeatureBest,
1054+
typename DataHelper::TSplitData & split)
10391055
{
1056+
services::Status st;
10401057
RNGs<IndexType, cpu> rng;
10411058
algorithmFPType featBuf[2];
10421059
IndexType * aIdx = _aSample.get() + iStart;
10431060
for (size_t i = 0; i < _nFeaturesPerNode; ++i)
10441061
{
10451062
IndexType iFeature;
10461063
*_numElems += 1;
1047-
rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
1064+
int errorcode = rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
1065+
if (errorcode)
1066+
{
1067+
st = services::Status(services::ErrorNullResult);
1068+
}
10481069
featureValuesToBuf(iFeature, featBuf, aIdx, 2);
10491070
if (featBuf[1] - featBuf[0] <= _accuracy) //all values of the feature are the same
10501071
continue;
10511072
_helper.simpleSplit(featBuf, aIdx, split);
10521073
split.featureUnordered = _featHelper.isUnordered(iFeature);
10531074
split.impurityDecrease = curImpurity.var;
10541075
iFeatureBest = iFeature;
1055-
return true;
1076+
return { st, true };
10561077
}
1057-
return false;
1078+
return { st, false };
10581079
}
10591080

10601081
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1061-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(size_t level, size_t iStart, size_t n,
1062-
const typename DataHelper::ImpurityData & curImpurity,
1063-
IndexType & iFeatureBest,
1064-
typename DataHelper::TSplitData & split,
1065-
algorithmFPType totalWeights)
1082+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(
1083+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
1084+
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
10661085
{
10671086
if (n == 2)
10681087
{
@@ -1078,26 +1097,67 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
10781097

10791098
//find best split and put it to featureIndexBuf
10801099
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1081-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(size_t level, size_t iStart, size_t n,
1082-
const typename DataHelper::ImpurityData & curImpurity,
1083-
IndexType & iBestFeature,
1084-
typename DataHelper::TSplitData & bestSplit,
1085-
algorithmFPType totalWeights)
1100+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(
1101+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
1102+
typename DataHelper::TSplitData & bestSplit, algorithmFPType totalWeights)
10861103
{
1087-
chooseFeatures();
1088-
size_t nVisitedFeature = 0;
1089-
const size_t maxFeatures = nFeatures();
1090-
const float qMax = 0.02; //min fracture of observations to be handled as indexed feature values
1091-
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
1092-
IndexType * aIdx = _aSample.get() + iStart;
1093-
int iBestSplit = -1;
1094-
int idxFeatureValueBestSplit = -1; //when sorted feature is used
1104+
services::Status st;
1105+
1106+
/* counter of the number of visited features, we visit _nFeaturesPerNode
1107+
* depending on _useConstFeatures, constant features can be skipped
1108+
*/
1109+
size_t nVisitedFeature = 0;
1110+
/* total number of features */
1111+
const size_t maxFeatures = nFeatures();
1112+
/* minimum fraction of all samples per bin */
1113+
const algorithmFPType qMax = 0.02;
1114+
/* index of the best split, initialized to first index we investigate */
1115+
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
1116+
/* sample index */
1117+
IndexType * aIdx = _aSample.get() + iStart;
1118+
/* zero-based index of best split */
1119+
int64_t iBestSplit = -1;
1120+
int64_t idxFeatureValueBestSplit = -1;
10951121
typename DataHelper::TSplitData split;
1096-
const float fact = float(n);
1122+
/* RNG for sample drawing */
1123+
RNGs<IndexType, cpu> rng;
1124+
/* index for swapping samples in Fisher-Yates sampling */
1125+
IndexType swapIdx;
1126+
10971127
for (size_t i = 0; i < maxFeatures && nVisitedFeature < _nFeaturesPerNode; ++i)
10981128
{
1099-
const auto iFeature = _aFeatureIdx[i];
1100-
const bool bUseIndexedFeatures = (!_par.memorySavingMode) && (fact > qMax * float(_helper.indexedFeatures().numIndices(iFeature)));
1129+
/* draw a random sample without replacement */
1130+
// based on Fisher Yates sampling
1131+
// _aFeatureIdx has length of 2 * _maxFeatures
1132+
// first maxFeatures contain the currently selected features
1133+
// at iteration i, we have drawn i features and written them to
1134+
// _aFeatureIdx[0, 1, ..., i-1]
1135+
//
1136+
// the second half of the buffer contains all numbers from
1137+
// [0, 1, ..., maxFeatures-1] and we randomly select one without
1138+
// replacement based on Fisher Yates sampling
1139+
// drawing uniformly from [0, maxFeatures-i] and swapping the indices
1140+
// assures uniform probability of all drawn numbers
1141+
1142+
/* draw the i-th index of the sample */
1143+
int errorcode = rng.uniform(1, &swapIdx, _engineImpl->getState(), 0, maxFeatures - i);
1144+
if (errorcode)
1145+
{
1146+
st = services::Status(services::ErrorNullResult);
1147+
}
1148+
1149+
/* account for buffer offset from 0 */
1150+
swapIdx += maxFeatures;
1151+
/* _aFeatureIdx[swapIdx] was drawn */
1152+
_aFeatureIdx[i] = _aFeatureIdx[swapIdx];
1153+
/* swap in number at [2 * maxFeatures - 1 - i] for next draw */
1154+
_aFeatureIdx[swapIdx] = _aFeatureIdx[2 * maxFeatures - 1 - i];
1155+
/* store drawn number at end of number buffer so that no number is lost */
1156+
_aFeatureIdx[2 * maxFeatures - 1 - i] = _aFeatureIdx[i];
1157+
1158+
const auto iFeature = _aFeatureIdx[i];
1159+
const bool bUseIndexedFeatures =
1160+
(!_par.memorySavingMode) && (algorithmFPType(n) > qMax * algorithmFPType(_helper.indexedFeatures().numIndices(iFeature)));
11011161

11021162
if (!_maxLeafNodes && !_useConstFeatures && !_par.memorySavingMode)
11031163
{
@@ -1154,7 +1214,14 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
11541214
#endif
11551215
}
11561216
}
1157-
if (iBestSplit < 0) return false; //not found
1217+
1218+
if (!st.ok() || iBestSplit < 0)
1219+
{
1220+
// either:
1221+
// error during splitting -> failure
1222+
// or no split found -> not a failure but still have to return
1223+
return { st, false };
1224+
}
11581225

11591226
iBestFeature = _aFeatureIdx[iBestSplit];
11601227
bool bCopyToIdx = true;
@@ -1193,20 +1260,18 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
11931260
bCopyToIdx = (iBestSplit + 1 < _nFeaturesPerNode); //if iBestSplit is the last considered feature
11941261
//then aIdx already contains the best split, no need to copy
11951262
if (bCopyToIdx) services::internal::tmemcpy<IndexType, cpu>(aIdx, bestSplitIdx, n);
1196-
return true;
1263+
return { st, true };
11971264
}
11981265

11991266
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
1200-
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(size_t level, size_t iStart, size_t n,
1201-
const typename DataHelper::ImpurityData & curImpurity,
1202-
IndexType & iFeatureBest,
1203-
typename DataHelper::TSplitData & split,
1204-
algorithmFPType totalWeights)
1267+
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(
1268+
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
1269+
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
12051270
{
12061271
chooseFeatures();
12071272
TArray<typename DataHelper::TSplitData, cpu> aFeatureSplit(_nFeaturesPerNode);
12081273
//TODO, if parallel for features
1209-
return false;
1274+
return { services::Status(services::ErrorMethodNotSupported), false };
12101275
}
12111276

12121277
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>

cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ Parameter::Parameter()
5656
minWeightFractionInLeafNode(0.),
5757
minImpurityDecreaseInSplitNode(0.),
5858
maxLeafNodes(0),
59-
minBinSize(5),
60-
maxBins(256)
59+
maxBins(256),
60+
minBinSize(5)
6161
{}
6262
} // namespace interface2
6363
Status checkImpl(const decision_forest::training::interface2::Parameter & prm)

cpp/daal/src/externals/service_memory.h

+13
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ T * service_memset(T * const ptr, const T value, const size_t num)
127127
return ptr;
128128
}
129129

130+
/* Initialize a block of memory of length num with the given value */
130131
template <typename T, CpuType cpu>
131132
void service_memset_seq(T * const ptr, const T value, const size_t num)
132133
{
@@ -138,6 +139,18 @@ void service_memset_seq(T * const ptr, const T value, const size_t num)
138139
}
139140
}
140141

142+
/* Initialize a block of memory of length num with the entries [startValue, ..., startValue + num - 1] */
143+
template <typename T, CpuType cpu>
144+
void service_memset_incrementing(T * const ptr, const T startValue, const size_t num)
145+
{
146+
PRAGMA_IVDEP
147+
PRAGMA_VECTOR_ALWAYS
148+
for (size_t i = 0; i < num; i++)
149+
{
150+
ptr[i] = startValue + i;
151+
}
152+
}
153+
141154
} // namespace internal
142155
} // namespace services
143156
} // namespace daal

0 commit comments

Comments
 (0)