[enhancement] SVE Implementation of sumWithSIMD (#3079)

* sve implementation of sumWithSIMD * compile fix * add missing file * clang-fix * Update cpp/daal/src/data_management/finiteness_checker_sve_impl.i Co-authored-by: Victoriya Fedotova <viktoria.nn@gmail.com> * Update cpp/daal/src/data_management/finiteness_checker_sve_impl.i Co-authored-by: Victoriya Fedotova <viktoria.nn@gmail.com> * Update cpp/daal/src/data_management/finiteness_checker_sve_impl.i Co-authored-by: Victoriya Fedotova <viktoria.nn@gmail.com> --------- Co-authored-by: Victoriya Fedotova <viktoria.nn@gmail.com>
uxlfoundation · Feb 28, 2025 · 7cc1c9b · 7cc1c9b
1 parent 7884821
commit 7cc1c9b
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 3 deletions.
diff --git a/cpp/daal/src/data_management/finiteness_checker_cpu.cpp b/cpp/daal/src/data_management/finiteness_checker_cpu.cpp
@@ -35,7 +35,7 @@ namespace internal
 {
 using namespace daal::internal;
 
-#if defined(DAAL_INTEL_CPP_COMPILER)
+#if defined(DAAL_INTEL_CPP_COMPILER) || (__CPUID__(DAAL_CPU) == __sve__)
 
 const size_t BLOCK_SIZE       = 8192;
 const size_t THREADING_BORDER = 262144;
@@ -51,7 +51,7 @@ DataType getInf()
     return inf;
 }
 
-// These functions are used for both AVX2 and AVX512
+// These functions are used for AVX2, AVX512 and SVE
 // and are therefore outside of their separate
 // implementations
 
@@ -84,6 +84,7 @@ DataType computeSumSIMD(size_t nDataPtrs, size_t nElementsPerPtr, const DataType
         size_t end           = blockIdxInPtr == nBlocksPerPtr - 1 ? start + nPerBlock + nSurplus : start + nPerBlock;
 
         //sumWithSIMD defined for AVX2 and AVX512 in finiteness_checker_avx2_impl.i and finiteness_checker_avx512_impl.i
+        //sumWithSIMD defined for SVE in finiteness_checker_sve_impl.i
         pSums[iBlock] = sumWithSIMD<DataType, cpu>(end - start, dataPtrs[ptrIdx] + start);
     });
 
@@ -161,6 +162,17 @@ double computeSumSOASIMD(NumericTable & table, bool & sumIsFinite, services::Sta
 
     return sum;
 }
+    #if defined(TARGET_ARM)
+        #if (__CPUID__(DAAL_CPU) == __sve__)
+
+            #include "finiteness_checker_sve_impl.i"
+
+        #endif // __CPUID__(DAAL_CPU) == __sve__
+    #endif
+
+#endif
+
+#if defined(DAAL_INTEL_CPP_COMPILER)
 
 template <daal::CpuType cpu>
 services::Status checkFinitenessInBlocks(const float ** dataPtrs, bool inParallel, size_t nTotalBlocks, size_t nBlocksPerPtr, size_t nPerBlock,
@@ -254,7 +266,6 @@ bool checkFinitenessSOASIMD(NumericTable & table, bool allowNaN, services::Statu
 
     return valuesAreFinite;
 }
-
     #if (__CPUID__(DAAL_CPU) == __avx512__)
 
         #include "finiteness_checker_avx512_impl.i"

diff --git a/cpp/daal/src/data_management/finiteness_checker_sve_impl.i b/cpp/daal/src/data_management/finiteness_checker_sve_impl.i
@@ -0,0 +1,106 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+/*
+ * Contains SVE optimizations for sumWithSIMD algorithm.
+ */
+
+#ifndef __FINITENESS_CHECKER_SVE_IMPL_I__
+#define __FINITENESS_CHECKER_SVE_IMPL_I__
+
+#include <arm_sve.h>
+
+/*
+// Computes the sum of the `n` elements of the `float` array `dataPtr` with SVE instructions.
+//
+// The array is consumed one vector (svcntw() lanes) at a time under a
+// `svwhilelt` predicate, so the trailing partial vector is handled by the same
+// loop and no scalar remainder loop is needed. Per-lane partial sums are
+// reduced with a single horizontal add at the end.
+// Returns 0.0f when `n` is 0 (the loop body is never entered).
+*/
+template <>
+float sumWithSIMD<float, sve>(size_t n, const float * dataPtr)
+{
+    svfloat32_t sums = svdup_f32(0.0f); // Per-lane accumulators, initialized to zero
+
+    size_t i    = 0;
+    svbool_t pg = svwhilelt_b32(i, n); // Lane k is active while i + k < n
+    while (svptest_any(svptrue_b32(), pg))
+    {
+        const svfloat32_t data = svld1_f32(pg, &dataPtr[i]); // Load the active elements
+        // Merging predication (_m) keeps the accumulated value in inactive lanes.
+        // With "don't care" predication (_x) the inactive lanes of the result are
+        // unspecified, which would corrupt the full-width horizontal sum below
+        // after the final partial iteration.
+        sums = svadd_f32_m(pg, sums, data);
+        i += svcntw();            // Advance by the number of 32-bit lanes per vector
+        pg = svwhilelt_b32(i, n); // Predicate for the next (possibly partial) vector
+    }
+
+    // Horizontal sum across all lanes of the accumulator
+    return svaddv_f32(svptrue_b32(), sums);
+}
+
+/*
+// Computes the sum of the `n` elements of the `double` array `dataPtr` with SVE instructions.
+//
+// The array is consumed one vector (svcntd() lanes) at a time under a
+// `svwhilelt` predicate, so the trailing partial vector is handled by the same
+// loop and no scalar remainder loop is needed. Per-lane partial sums are
+// reduced with a single horizontal add at the end.
+// Returns 0.0 when `n` is 0 (the loop body is never entered).
+*/
+template <>
+double sumWithSIMD<double, sve>(size_t n, const double * dataPtr)
+{
+    svfloat64_t sums = svdup_f64(0.0); // Per-lane accumulators, initialized to zero
+
+    size_t i    = 0;
+    svbool_t pg = svwhilelt_b64(i, n); // Lane k is active while i + k < n
+    while (svptest_any(svptrue_b64(), pg))
+    {
+        const svfloat64_t data = svld1_f64(pg, &dataPtr[i]); // Load the active elements
+        // Merging predication (_m) keeps the accumulated value in inactive lanes.
+        // With "don't care" predication (_x) the inactive lanes of the result are
+        // unspecified, which would corrupt the full-width horizontal sum below
+        // after the final partial iteration.
+        sums = svadd_f64_m(pg, sums, data);
+        i += svcntd();            // Advance by the number of 64-bit lanes per vector
+        pg = svwhilelt_b64(i, n); // Predicate for the next (possibly partial) vector
+    }
+
+    // Horizontal sum across all lanes of the accumulator
+    return svaddv_f64(svptrue_b64(), sums);
+}
+
+/*
+// SVE specialization of computeSum for `float`: forwards all arguments to the
+// shared computeSumSIMD driver (defined in finiteness_checker_cpu.cpp).
+*/
+template <>
+float computeSum<float, sve>(size_t nDataPtrs, size_t nElementsPerPtr, const float ** dataPtrs)
+{
+    const float total = computeSumSIMD<float, sve>(nDataPtrs, nElementsPerPtr, dataPtrs);
+    return total;
+}
+
+/*
+// SVE specialization of computeSum for `double`: forwards all arguments to the
+// shared computeSumSIMD driver (defined in finiteness_checker_cpu.cpp).
+*/
+template <>
+double computeSum<double, sve>(size_t nDataPtrs, size_t nElementsPerPtr, const double ** dataPtrs)
+{
+    const double total = computeSumSIMD<double, sve>(nDataPtrs, nElementsPerPtr, dataPtrs);
+    return total;
+}
+
+/*
+// SVE specialization of computeSumSOA: forwards to the shared computeSumSOASIMD
+// implementation (defined in finiteness_checker_cpu.cpp). `sumIsFinite` and `st`
+// are handed through by reference unchanged.
+*/
+template <>
+double computeSumSOA<sve>(NumericTable & table, bool & sumIsFinite, services::Status & st)
+{
+    const double total = computeSumSOASIMD<sve>(table, sumIsFinite, st);
+    return total;
+}
+
+//TODO: Implement checkFinitenessInBlocks()
+
+#endif // __FINITENESS_CHECKER_SVE_IMPL_I__