Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement sumWithSIMD using xsimd #3099

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cpp/daal/src/data_management/finiteness_checker_avx2_impl.i
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#ifndef __FINITENESS_CHECKER_AVX2_IMPL_I__
#define __FINITENESS_CHECKER_AVX2_IMPL_I__

#ifndef ONEDAL_XSIMD_ENABLED

/*
// Computes sum of the elements of input array of type `float` with AVX2 instructions.
*/
Expand Down Expand Up @@ -76,6 +78,8 @@ double sumWithSIMD<double, avx2>(size_t n, const double * dataPtr)
return sum;
}

#endif // ONEDAL_XSIMD_ENABLED

template <>
float computeSum<float, avx2>(size_t nDataPtrs, size_t nElementsPerPtr, const float ** dataPtrs)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#ifndef __FINITENESS_CHECKER_AVX512_IMPL_I__
#define __FINITENESS_CHECKER_AVX512_IMPL_I__

#ifndef ONEDAL_XSIMD_ENABLED

/*
// Computes sum of the elements of input array of type `float` with AVX512 instructions.
*/
Expand Down Expand Up @@ -60,6 +62,8 @@ double sumWithSIMD<double, avx512>(size_t n, const double * dataPtr)
return sum;
}

#endif // ONEDAL_XSIMD_ENABLED

template <>
float computeSum<float, avx512>(size_t nDataPtrs, size_t nElementsPerPtr, const float ** dataPtrs)
{
Expand Down
58 changes: 58 additions & 0 deletions cpp/daal/src/data_management/finiteness_checker_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@

#include "finiteness_checker_impl.i"

#ifdef ONEDAL_XSIMD_ENABLED

// Short alias for the xsimd namespace; only introduced when the build
// defines ONEDAL_XSIMD_ENABLED.
namespace xs = xsimd;

#endif

namespace daal
{
namespace data_management
Expand Down Expand Up @@ -58,6 +64,57 @@ DataType getInf()
template <typename DataType, daal::CpuType cpu>
DataType sumWithSIMD(size_t n, const DataType * dataPtr);

#ifdef ONEDAL_XSIMD_ENABLED

/*
// Computes sum of the elements of input array using XSIMD.
//
// @tparam DataType Data type of the input array
// @tparam XSIMDArch XSIMD CPU architecture
//
// @param[in] n Number of elements in the input array
// @param[in] dataPtr Pointer to the input array
//
// @return Sum of the elements of the input array
*/
template <typename DataType, typename XSIMDArch>
DataType sumWithXSIMD(size_t n, const DataType * dataPtr)
{
constexpr size_t nPerInstr = xs::batch<DataType, XSIMDArch>::size;
DataType sum;

xs::batch<DataType, XSIMDArch> xs_sums(0.0);
const DataType * curDataPtr = dataPtr;
const size_t iEnd = n / nPerInstr;
for (size_t i = 0; i < iEnd; i++, curDataPtr += nPerInstr)
{
xs::batch<DataType, XSIMDArch> xs_data = xs::load_unaligned(curDataPtr);
xs_sums += xs_data;
}
sum = xs::reduce_add(xs_sums);

for (size_t i = iEnd * nPerInstr; i < n; ++i) sum += dataPtr[i];

return sum;
}


#if (__CPUID__(DAAL_CPU) != __sse2__)

/*
// Specialization of sumWithSIMD for `float` on the CPU selected by DAAL_CPU.
// Forwards to sumWithXSIMD using the architecture tag ONEDAL_XSIMD_ARCH.
//
// @param[in] n       Number of elements in the input array
// @param[in] dataPtr Pointer to the input array
//
// @return Value returned by sumWithXSIMD<float, ONEDAL_XSIMD_ARCH>
*/
template <>
float sumWithSIMD<float, DAAL_CPU>(size_t n, const float * dataPtr) {
return sumWithXSIMD<float, ONEDAL_XSIMD_ARCH>(n, dataPtr);
}

/*
// Specialization of sumWithSIMD for `double` on the CPU selected by DAAL_CPU.
// Forwards to sumWithXSIMD using the architecture tag ONEDAL_XSIMD_ARCH.
//
// @param[in] n       Number of elements in the input array
// @param[in] dataPtr Pointer to the input array
//
// @return Value returned by sumWithXSIMD<double, ONEDAL_XSIMD_ARCH>
*/
template <>
double sumWithSIMD<double, DAAL_CPU>(size_t n, const double * dataPtr) {
return sumWithXSIMD<double, ONEDAL_XSIMD_ARCH>(n, dataPtr);
}

#endif

#endif

/*
// Computes multi-threaded sum of a numeric table via summation using SIMD calls
*/
Expand Down Expand Up @@ -266,6 +323,7 @@ bool checkFinitenessSOASIMD(NumericTable & table, bool allowNaN, services::Statu

return valuesAreFinite;
}

#if (__CPUID__(DAAL_CPU) == __avx512__)

#include "finiteness_checker_avx512_impl.i"
Expand Down
4 changes: 4 additions & 0 deletions cpp/daal/src/data_management/finiteness_checker_sve_impl.i
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include <arm_sve.h>

#ifndef ONEDAL_XSIMD_ENABLED

/*
// Computes sum of the elements of input array of type `float` with sve instructions.
*/
Expand Down Expand Up @@ -80,6 +82,8 @@ double sumWithSIMD<double, sve>(size_t n, const double * dataPtr)
return sum;
}

#endif // ONEDAL_XSIMD_ENABLED

template <>
float computeSum<float, sve>(size_t nDataPtrs, size_t nElementsPerPtr, const float ** dataPtrs)
{
Expand Down
38 changes: 38 additions & 0 deletions cpp/daal/src/services/service_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
#include <stdint.h>
#include "services/env_detect.h"


#ifdef ONEDAL_XSIMD_ENABLED

#include "xsimd/xsimd.hpp"

#endif

DAAL_EXPORT int __daal_serv_cpu_detect(int);

void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd);
Expand Down Expand Up @@ -155,6 +162,37 @@ enum DataFormat
#define __CPUID__(cpu) __GLUE__(CPU_, cpu)
#define __FPTYPE__(type) __GLUE__(FPTYPE_, type)


// Selects the xsimd architecture tag (ONEDAL_XSIMD_ARCH) from the CPU the
// current translation unit is compiled for (DAAL_CPU). Only active when
// ONEDAL_XSIMD_ENABLED is defined.
// NOTE(review): there is no fallback branch, so ONEDAL_XSIMD_ARCH stays
// undefined for any DAAL_CPU value not listed below.
#ifdef ONEDAL_XSIMD_ENABLED

#if (__CPUID__(DAAL_CPU) == __avx512__)

#define ONEDAL_XSIMD_ARCH xsimd::avx512bw

#elif (__CPUID__(DAAL_CPU) == __avx2__)

#define ONEDAL_XSIMD_ARCH xsimd::fma3<xsimd::avx2>

#elif (__CPUID__(DAAL_CPU) == __sse42__)

#define ONEDAL_XSIMD_ARCH xsimd::sse4_2

#elif (__CPUID__(DAAL_CPU) == __sse2__)

#define ONEDAL_XSIMD_ARCH xsimd::sse2

#elif (__CPUID__(DAAL_CPU) == __sve__)

// The SVE tag argument depends on the floating-point type selected by
// DAAL_DATA_TYPE.
// NOTE(review): confirm what the sve<> template argument denotes in the xsimd
// version in use (and that xsimd::sve is a template there) before relying on
// these values.
#if (__FPTYPE__(DAAL_DATA_TYPE) == __float__)
#define ONEDAL_XSIMD_ARCH xsimd::sve<16>
#else
#define ONEDAL_XSIMD_ARCH xsimd::sve<8>
#endif

#endif // __CPUID__(DAAL_CPU)

#endif // ONEDAL_XSIMD_ENABLED

/*
// Set of macro definitions
// for FP values bit fields easy access
Expand Down
13 changes: 10 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,13 @@ ifeq ($(REQPROFILE), yes)
VTUNESDK.LIBS_A := $(if $(OS_is_lnx), $(VTUNESDK.libia)/libittnotify.a,)
endif

#================================= XSIMD folders ========================================

# Opt-in xsimd support: only when REQXSIMD is set to "yes".
ifeq ($(REQXSIMD), yes)
# Compiler define that switches on the ONEDAL_XSIMD_ENABLED code paths.
-DXSIMD_ENABLED := -DONEDAL_XSIMD_ENABLED
# Include directory of the xsimd headers.
# NOTE(review): XSIMD_DIR is not set in this fragment; presumably it comes from
# the environment or the make command line - confirm.
XSIMD.include := $(XSIMD_DIR)/include
endif

#===============================================================================
# Release library names
#===============================================================================
Expand Down Expand Up @@ -447,7 +454,7 @@ CORE.srcdirs := $(CORE.SERV.srcdir) $(CORE.srcdir) \
$(CPPDIR.daal)/src/data_management

CORE.incdirs.common := $(RELEASEDIR.include) $(CPPDIR.daal) $(WORKDIR)
CORE.incdirs.thirdp := $(daaldep.math_backend.incdir) $(VTUNESDK.include) $(TBBDIR.include)
CORE.incdirs.thirdp := $(daaldep.math_backend.incdir) $(VTUNESDK.include) $(TBBDIR.include) $(XSIMD.include)
CORE.incdirs := $(CORE.incdirs.common) $(CORE.incdirs.thirdp)

$(info CORE.incdirs: $(CORE.incdirs))
Expand Down Expand Up @@ -505,7 +512,7 @@ $(WORKDIR.lib)/$(core_y): $(daaldep.math_backend.ext) $(VTUNES
$(CORE.tmpdir_y)/$(core_y:%.$y=%_link.txt) ; $(LINK.DYNAMIC) ; $(LINK.DYNAMIC.POST)

$(CORE.objs_a): $(CORE.tmpdir_a)/inc_a_folders.txt
$(CORE.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64) $(-DPROFILER)
$(CORE.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64) $(-DPROFILER) $(-DXSIMD_ENABLED)
$(CORE.objs_a): COPT += -D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \
-DDAAL_HIDE_DEPRECATED -DTBB_USE_ASSERT=0 -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
$(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG)
Expand All @@ -514,7 +521,7 @@ $(CORE.objs_a): COPT += @$(CORE.tmpdir_a)/inc_a_folders.txt
$(eval $(call append_uarch_copt,$(CORE.objs_a)))

$(CORE.objs_y): $(CORE.tmpdir_y)/inc_y_folders.txt
$(CORE.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64) $(-DPROFILER)
$(CORE.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64) $(-DPROFILER) $(-DXSIMD_ENABLED)
$(CORE.objs_y): COPT += -D__DAAL_IMPLEMENTATION \
-D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \
-DDAAL_HIDE_DEPRECATED -DTBB_USE_ASSERT=0 -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
Expand Down
Loading