Skip to content

Commit 70797ec

Browse files
author
Arnaud De-Mattia
committed
check for __FMA__ in case of SSE/AVX
1 parent 63d8bdc commit 70797ec

10 files changed: +33 -20 lines changed

common.mk

+1-1
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,7 @@ ifeq ($(DO_CHECKS), 1)
331331
CFLAGS += -funroll-loops
332332
CFLAGS += -march=native -fno-strict-aliasing
333333
CFLAGS += -Wformat=2 -Wpacked -Wnested-externs -Wpointer-arith -Wredundant-decls -Wfloat-equal -Wcast-qual
334-
CFLAGS += -Wcast-align -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wstrict-prototypes #-D_POSIX_C_SOURCE=2 -Wpadded -Wconversion
334+
CFLAGS += -Wcast-align -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wstrict-prototypes #-D_POSIX_C_SOURCE=2 -Wpadded -Wconversion
335335
CFLAGS += -Wno-unused-local-typedefs ## to suppress the unused typedef warning for the compile time assert for sizeof(struct config_options)
336336

337337
# if TESTS are being run then add the -fsanitize options

theory/DD/countpairs_kernels.c.src

+1-1
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ static inline int countpairs_avx512_intrinsics_DOUBLE(const int64_t N0, DOUBLE *
4949
}
5050
const int32_t need_rpavg = src_rpavg != NULL;
5151
const int32_t need_weightavg = src_weightavg != NULL;
52-
const DOUBLE sqr_rpmin=rpmin*rpmin, sqr_rpmax=rpmax*rpmax;
52+
const DOUBLE sqr_rpmax=rpmax*rpmax;
5353
AVX512_FLOATS m_inv_rpstep = AVX512_SETZERO_FLOAT();
5454
AVX512_FLOATS m_rpmin_invstep = AVX512_SETZERO_FLOAT();
5555
if (bin_type == BIN_LIN) {

theory/DDrppi/countpairs_rp_pi_kernels.c.src

+1-1
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ static inline int countpairs_rp_pi_avx512_intrinsics_DOUBLE(const int64_t N0, DO
4343

4444
const int32_t need_rpavg = src_rpavg != NULL;
4545
const int32_t need_weightavg = src_weightavg != NULL;
46-
const DOUBLE sqr_rpmin=rpmin*rpmin, sqr_rpmax=rpmax*rpmax;
46+
const DOUBLE sqr_rpmax=rpmax*rpmax;
4747
AVX512_FLOATS m_inv_rpstep = AVX512_SETZERO_FLOAT();
4848
AVX512_FLOATS m_rpmin_invstep = AVX512_SETZERO_FLOAT();
4949
if (bin_type == BIN_LIN) {

theory/DDsmu/countpairs_s_mu_kernels.c.src

+1-1
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ static inline int countpairs_s_mu_avx512_intrinsics_DOUBLE(const int64_t N0, DOU
4343

4444
const int32_t need_savg = src_savg != NULL;
4545
const int32_t need_weightavg = src_weightavg != NULL;
46-
const DOUBLE sqr_smin=smin*smin, sqr_smax=smax*smax;
46+
const DOUBLE sqr_smax=smax*smax;
4747
AVX512_FLOATS m_inv_sstep = AVX512_SETZERO_FLOAT();
4848
AVX512_FLOATS m_smin_invstep = AVX512_SETZERO_FLOAT();
4949
if (bin_type == BIN_LIN) {

theory/wp/wp_kernels.c.src

+1-1
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ static inline int wp_avx512_intrinsics_DOUBLE(DOUBLE *x0, DOUBLE *y0, DOUBLE *z0
4848
}
4949
const int32_t need_rpavg = src_rpavg != NULL;
5050
const int32_t need_weightavg = src_weightavg != NULL;
51-
const DOUBLE sqr_rpmin=rpmin*rpmin, sqr_rpmax=rpmax*rpmax;
51+
const DOUBLE sqr_rpmax=rpmax*rpmax;
5252
AVX512_FLOATS m_inv_rpstep = AVX512_SETZERO_FLOAT();
5353
AVX512_FLOATS m_rpmin_invstep = AVX512_SETZERO_FLOAT();
5454
if (bin_type == BIN_LIN) {

theory/xi/xi_kernels.c.src

+1-1
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ static inline int xi_avx512_intrinsics_DOUBLE(DOUBLE *x1, DOUBLE *y1, DOUBLE *z1
5050
}
5151
const int32_t need_rpavg = src_rpavg != NULL;
5252
const int32_t need_weightavg = src_weightavg != NULL;
53-
const DOUBLE sqr_rmin=rmin*rmin, sqr_rmax=rmax*rmax;
53+
const DOUBLE sqr_rmax=rmax*rmax;
5454
AVX512_FLOATS m_inv_rpstep = AVX512_SETZERO_FLOAT();
5555
AVX512_FLOATS m_rpmin_invstep = AVX512_SETZERO_FLOAT();
5656
if (bin_type == BIN_LIN) {

utils/avx512_calls.c

+1-1
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ const uint16_t masks_per_misalignment_value_float[] = {0b1111111111111111,
2828
0b0111111111111111};
2929

3030
const int64_t bits_set_in_avx512_mask_double[] = { B8(0) };
31-
const uint8_t masks_per_misalignment_value_double[] = {0b11111111,
31+
const uint8_t masks_per_misalignment_value_double[] = {0b11111111,
3232
0b00000001,
3333
0b00000011,
3434
0b00000111,

utils/avx512_calls.h

+3-3
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ extern "C" {
111111
#define AVX512_MASK_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_mask_fmadd_ps(X, MASK, Y, Z)
112112
#define AVX512_MASKZ_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_maskz_fmadd_ps(MASK, X, Y, Z)
113113
#define AVX512_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm512_fmadd_round_ps(X, Y, Z, _MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
114-
#define AVX512_MASKZ_FMA_ADD_TRUNCATE_FLOATS(MASK, X,Y,Z) _mm512_fmadd_maskz_round_ps(MASK, X, Y, Z, _MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
114+
#define AVX512_MASKZ_FMA_ADD_TRUNCATE_FLOATS(MASK, X,Y,Z) _mm512_maskz_fmadd_round_ps(MASK, X, Y, Z, _MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
115115

116116
/* returns Z - XY*/
117117
#define AVX512_FNMA_ADD_FLOATS(X, Y, Z) _mm512_fnmadd_ps(X, Y, Z)
@@ -243,8 +243,8 @@ extern "C" {
243243
#define AVX512_SUBTRACT_FLOATS(X,Y) _mm512_sub_pd(X,Y)
244244
#define AVX512_MASK_SUBTRACT_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_sub_pd(FALSEVALS, MASK, X,Y)
245245
#define AVX512_MASKZ_SUBTRACT_FLOATS(MASK, X,Y) _mm512_maskz_sub_pd(MASK, X,Y)
246-
247-
246+
#define AVX512_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm512_fmadd_round_pd(X, Y, Z, _MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
247+
#define AVX512_MASKZ_FMA_ADD_TRUNCATE_FLOATS(MASK, X,Y,Z) _mm512_maskz_fmadd_round_pd(MASK, X, Y, Z, _MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
248248

249249
/* returns Z + XY*/
250250
#define AVX512_FMA_ADD_FLOATS(X,Y,Z) _mm512_fmadd_pd(X,Y,Z)

utils/avx_calls.h

+12-4
Original file line number | Diff line number | Diff line change
@@ -62,8 +62,12 @@ extern "C" {
6262
#define AVX_SET_FLOAT(X) _mm256_set1_ps(X)
6363

6464
/* returns Z + XY*/
65+
#ifdef __FMA__
6566
#define AVX_FMA_ADD_FLOATS(X,Y,Z) _mm256_fmadd_ps(X,Y,Z)
66-
#define AVX_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm256_round_ps(_mm256_fmadd_ps(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
67+
#else
68+
#define AVX_FMA_ADD_FLOATS(X,Y,Z) _mm256_add_ps(_mm256_mul_ps(X,Y),Z)
69+
#endif
70+
#define AVX_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm256_round_ps(AVX_FMA_ADD_FLOATS(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
6771

6872
// X OP Y
6973
#define AVX_COMPARE_FLOATS(X,Y,OP) _mm256_cmp_ps(X,Y,OP)
@@ -82,11 +86,11 @@ extern "C" {
8286
#ifdef __INTEL_COMPILER
8387
#define AVX_ARC_COSINE(X, order) _mm256_acos_ps(X)
8488
#else
85-
//Other compilers do not have the vectorized arc-cosine
89+
//Other compilers do not have the vectorized arc-cosine
8690
#define AVX_ARC_COSINE(X, order) inv_cosine_avx(X, order)
8791
#endif
8892

89-
//Max
93+
//Max
9094
#define AVX_MAX_FLOATS(X,Y) _mm256_max_ps(X,Y)
9195

9296

@@ -127,8 +131,12 @@ extern "C" {
127131
#define AVX_RECIPROCAL_FLOATS(X) _mm256_rcp_pd(X)
128132

129133
/* returns Z + XY*/
134+
#ifdef __FMA__
130135
#define AVX_FMA_ADD_FLOATS(X,Y,Z) _mm256_fmadd_pd(X,Y,Z)
131-
#define AVX_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm256_round_pd(_mm256_fmadd_pd(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
136+
#else
137+
#define AVX_FMA_ADD_FLOATS(X,Y,Z) _mm256_add_pd(_mm256_mul_pd(X,Y),Z)
138+
#endif
139+
#define AVX_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm256_round_pd(AVX_FMA_ADD_FLOATS(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
132140

133141
// X OP Y
134142
#define AVX_COMPARE_FLOATS(X,Y,OP) _mm256_cmp_pd(X,Y,OP)

utils/sse_calls.h

+11-6
Original file line number | Diff line number | Diff line change
@@ -61,10 +61,12 @@ extern "C" {
6161
#define SSE_ABS_FLOAT(X) _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), X), X)
6262

6363
/* returns Z + XY*/
64+
#ifdef __FMA__
6465
#define SSE_FMA_ADD_FLOATS(X,Y,Z) _mm_fmadd_ps(X,Y,Z)
65-
//#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_fmadd_round_ss(X,Y,Z,_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
66-
#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_round_ps(_mm_fmadd_ps(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
67-
66+
#else
67+
#define SSE_FMA_ADD_FLOATS(X,Y,Z) _mm_add_ps(_mm_mul_ps(X,Y),Z)
68+
#endif
69+
#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_round_ps(SSE_FMA_ADD_FLOATS(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
6870

6971
#ifdef __INTEL_COMPILER
7072
#define SSE_ARC_COSINE(X, order) _mm_acos_ps(X)
@@ -122,9 +124,12 @@ extern "C" {
122124
#define SSE_ABS_FLOAT(X) _mm_max_pd(_mm_sub_pd(_mm_setzero_pd(), X), X)
123125

124126
/* returns Z + XY*/
125-
#define SSE_FMA_ADD_FLOATS(X,Y,Z) _mm_fmadd_pd(X,Y,Z)
126-
//#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_fmadd_round_sd(X,Y,Z,_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
127-
#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_round_pd(_mm_fmadd_pd(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
127+
#ifdef __FMA__
128+
#define SSE_FMA_ADD_FLOATS(X,Y,Z) _mm_fmadd_pd(X,Y,Z)
129+
#else
130+
#define SSE_FMA_ADD_FLOATS(X,Y,Z) _mm_add_pd(_mm_mul_pd(X,Y),Z)
131+
#endif
132+
#define SSE_FMA_ADD_TRUNCATE_FLOATS(X,Y,Z) _mm_round_pd(SSE_FMA_ADD_FLOATS(X,Y,Z),_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC)
128133

129134
#endif
130135

0 commit comments

Comments
 (0)