diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
index e8208ba6..bf37c19d 100644
--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -261,89 +261,89 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec,
 
 #endif /*LV_HAVE_NEON*/
 
-//#if LV_HAVE_AVX2
-//
-//#include <immintrin.h>
-//#include <stdio.h>
-//
-// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
-//                                                     unsigned char* syms,
-//                                                     unsigned int framebits)
-//{
-//    if (framebits < 12) {
-//        return;
-//    }
-//
-//    static int once = 1;
-//    int d_numstates = (1 << 6);
-//    int rate = 2;
-//    static unsigned char* D;
-//    static unsigned char* Y;
-//    static unsigned char* X;
-//    static unsigned int excess = 6;
-//    static unsigned char* Branchtab;
-//    static unsigned char Partab[256];
-//
-//    int d_polys[2] = { 79, 109 };
-//
-//
-//    if (once) {
-//
-//        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
-//        Y = X + d_numstates;
-//        Branchtab =
-//            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
-//        D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
-//                                        volk_get_alignment());
-//        int state, i;
-//        int cnt, ti;
-//
-//        /* Initialize parity lookup table */
-//        for (i = 0; i < 256; i++) {
-//            cnt = 0;
-//            ti = i;
-//            while (ti) {
-//                if (ti & 1)
-//                    cnt++;
-//                ti >>= 1;
-//            }
-//            Partab[i] = cnt & 1;
-//        }
-//        /* Initialize the branch table */
-//        for (state = 0; state < d_numstates / 2; state++) {
-//            for (i = 0; i < rate; i++) {
-//                Branchtab[i * d_numstates / 2 + state] =
-//                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
-//            }
-//        }
-//
-//        once = 0;
-//    }
-//
-//    // unbias the old_metrics
-//    memset(X, 31, d_numstates);
-//
-//    // initialize decisions
-//    memset(D, 0, (d_numstates / 8) * (framebits + 6));
-//
-//    volk_8u_x4_conv_k7_r2_8u_avx2(
-//        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
-//
-//    unsigned int min = X[0];
-//    int i = 0, state = 0;
-//    for (i = 0; i < (d_numstates); ++i) {
-//        if (X[i] < min) {
-//            min = X[i];
-//            state = i;
-//        }
-//    }
-//
-//    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
-//
-//    return;
-//}
-//
-//#endif /*LV_HAVE_AVX2*/
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
+                                                    unsigned char* syms,
+                                                    unsigned int framebits)
+{
+    if (framebits < 12) {
+        return;
+    }
+
+    static int once = 1;
+    int d_numstates = (1 << 6);
+    int rate = 2;
+    static unsigned char* D;
+    static unsigned char* Y;
+    static unsigned char* X;
+    static unsigned int excess = 6;
+    static unsigned char* Branchtab;
+    static unsigned char Partab[256];
+
+    int d_polys[2] = { 79, 109 };
+
+
+    if (once) {
+
+        X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+        Y = X + d_numstates;
+        Branchtab =
+            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+        D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+                                        volk_get_alignment());
+        int state, i;
+        int cnt, ti;
+
+        /* Initialize parity lookup table */
+        for (i = 0; i < 256; i++) {
+            cnt = 0;
+            ti = i;
+            while (ti) {
+                if (ti & 1)
+                    cnt++;
+                ti >>= 1;
+            }
+            Partab[i] = cnt & 1;
+        }
+        /* Initialize the branch table */
+        for (state = 0; state < d_numstates / 2; state++) {
+            for (i = 0; i < rate; i++) {
+                Branchtab[i * d_numstates / 2 + state] =
+                    parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+            }
+        }
+
+        once = 0;
+    }
+
+    // unbias the old_metrics
+    memset(X, 31, d_numstates);
+
+    // initialize decisions
+    memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+    volk_8u_x4_conv_k7_r2_8u_avx2(
+        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
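+    // search all 64 terminating states for the smallest path metric, then chain back
+    // from that state to recover the decoded bits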
+    unsigned int min = X[0];
+    int i = 0, state = 0;
+    for (i = 0; i < (d_numstates); ++i) {
+        if (X[i] < min) {
+            min = X[i];
+            state = i;
+        }
+    }
+
+    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+    return;
+}
+
+#endif /*LV_HAVE_AVX2*/
 
 
 #if LV_HAVE_GENERIC
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
index 14593a80..161b9cd4 100644
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -114,209 +114,201 @@ static inline void BFLY(int i,
 }
 
-//#if LV_HAVE_AVX2
-//
-//#include <immintrin.h>
-//#include <stdio.h>
-//
-// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
-//                                                  unsigned char* X,
-//                                                  unsigned char* syms,
-//                                                  unsigned char* dec,
-//                                                  unsigned int framebits,
-//                                                  unsigned int excess,
-//                                                  unsigned char* Branchtab)
-//{
-//    unsigned int i9;
-//    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
-//        unsigned char a75, a81;
-//        int a73, a92;
-//        int s20, s21;
-//        unsigned char *a80, *b6;
-//        int *a110, *a91, *a93;
-//        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
-//        __m256i a86, a87;
-//        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
-//            m26, s18, s19, s22, s23, s24, s25, t13, t14, t15;
-//        a71 = ((__m256i*)X);
-//        s18 = *(a71);
-//        a72 = (a71 + 1);
-//        s19 = *(a72);
-//        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
-//        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
-//        s18 = s22;
-//        a73 = (4 * i9);
-//        b6 = (syms + a73);
-//        a75 = *(b6);
-//        a76 = _mm256_set1_epi8(a75);
-//        a77 = ((__m256i*)Branchtab);
-//        a78 = *(a77);
-//        a79 = _mm256_xor_si256(a76, a78);
-//        a80 = (b6 + 1);
-//        a81 = *(a80);
-//        a82 = _mm256_set1_epi8(a81);
-//        a83 = (a77 + 1);
-//        a84 = *(a83);
-//        a85 = _mm256_xor_si256(a82, a84);
-//        t13 = _mm256_avg_epu8(a79, a85);
-//        a86 = ((__m256i)t13);
-//        a87 = _mm256_srli_epi16(a86, 2);
-//        a88 = ((__m256i)a87);
-//        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
-//        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
-//        m23 = _mm256_adds_epu8(s18, t14);
-//        m24 = _mm256_adds_epu8(s19, t15);
-//        m25 = _mm256_adds_epu8(s18, t15);
-//        m26 = _mm256_adds_epu8(s19, t14);
-//        a89 = _mm256_min_epu8(m24, m23);
-//        d9 = _mm256_cmpeq_epi8(a89, m24);
-//        a90 = _mm256_min_epu8(m26, m25);
-//        d10 = _mm256_cmpeq_epi8(a90, m26);
-//        s22 = _mm256_unpacklo_epi8(d9, d10);
-//        s23 = _mm256_unpackhi_epi8(d9, d10);
-//        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
-//        a91 = ((int*)dec);
-//        a92 = (4 * i9);
-//        a93 = (a91 + a92);
-//        *(a93) = s20;
-//        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
-//        a110 = (a93 + 1);
-//        *(a110) = s21;
-//        s22 = _mm256_unpacklo_epi8(a89, a90);
-//        s23 = _mm256_unpackhi_epi8(a89, a90);
-//        a95 = ((__m256i*)Y);
-//        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
-//        *(a95) = s24;
-//        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
-//        a112 = (a95 + 1);
-//        *(a112) = s23;
-//        if ((((unsigned char*)Y)[0] > 210)) {
-//            __m256i m5, m6;
-//            m5 = ((__m256i*)Y)[0];
-//            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
-//            __m256i m7;
-//            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
-//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
-//                                           ((__m256i)m7)));
-//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
-//                                           ((__m256i)m7)));
-//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
-//                                           ((__m256i)m7)));
-//            m7 = _mm256_unpacklo_epi8(m7, m7);
-//            m7 = _mm256_shufflelo_epi16(m7, 0);
-//            m6 = _mm256_unpacklo_epi64(m7, m7);
-//            m6 = _mm256_permute2x128_si256(
-//                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
-//                            // operate on 128 bit lanes
-//            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
-//            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
-//        }
-//        unsigned char a188, a194;
-//        int a205;
-//        int s48, s54;
-//        unsigned char *a187, *a193;
-//        int *a204, *a206, *a223, *b16;
-//        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
-//        __m256i a199, a200;
-//        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
-//            m40, m41, m42, s46, s47, s50, s51, t25, t26, t27;
-//        a184 = ((__m256i*)Y);
-//        s46 = *(a184);
-//        a185 = (a184 + 1);
-//        s47 = *(a185);
-//        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
-//        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
-//        s46 = s50;
-//        a187 = (b6 + 2);
-//        a188 = *(a187);
-//        a189 = _mm256_set1_epi8(a188);
-//        a190 = ((__m256i*)Branchtab);
-//        a191 = *(a190);
-//        a192 = _mm256_xor_si256(a189, a191);
-//        a193 = (b6 + 3);
-//        a194 = *(a193);
-//        a195 = _mm256_set1_epi8(a194);
-//        a196 = (a190 + 1);
-//        a197 = *(a196);
-//        a198 = _mm256_xor_si256(a195, a197);
-//        t25 = _mm256_avg_epu8(a192, a198);
-//        a199 = ((__m256i)t25);
-//        a200 = _mm256_srli_epi16(a199, 2);
-//        a201 = ((__m256i)a200);
-//        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
-//        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
-//        m39 = _mm256_adds_epu8(s46, t26);
-//        m40 = _mm256_adds_epu8(s47, t27);
-//        m41 = _mm256_adds_epu8(s46, t27);
-//        m42 = _mm256_adds_epu8(s47, t26);
-//        a202 = _mm256_min_epu8(m40, m39);
-//        d17 = _mm256_cmpeq_epi8(a202, m40);
-//        a203 = _mm256_min_epu8(m42, m41);
-//        d18 = _mm256_cmpeq_epi8(a203, m42);
-//        s24 = _mm256_unpacklo_epi8(d17, d18);
-//        s25 = _mm256_unpackhi_epi8(d17, d18);
-//        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-//        a204 = ((int*)dec);
-//        a205 = (4 * i9);
-//        b16 = (a204 + a205);
-//        a206 = (b16 + 2);
-//        *(a206) = s48;
-//        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-//        a223 = (b16 + 3);
-//        *(a223) = s54;
-//        s50 = _mm256_unpacklo_epi8(a202, a203);
-//        s51 = _mm256_unpackhi_epi8(a202, a203);
-//        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
-//        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
-//        a208 = ((__m256i*)X);
-//        *(a208) = s25;
-//        a225 = (a208 + 1);
-//        *(a225) = s51;
-//
-//        if ((((unsigned char*)X)[0] > 210)) {
-//            __m256i m12, m13;
-//            m12 = ((__m256i*)X)[0];
-//            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
-//            __m256i m14;
-//            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
-//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
-//                                            ((__m256i)m14)));
-//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
-//                                            ((__m256i)m14)));
-//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
-//                                            ((__m256i)m14)));
-//            m14 = _mm256_unpacklo_epi8(m14, m14);
-//            m14 = _mm256_shufflelo_epi16(m14, 0);
-//            m13 = _mm256_unpacklo_epi64(m14, m14);
-//            m13 = _mm256_permute2x128_si256(m13, m13, 0);
-//            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
-//            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
-//        }
-//    }
-//
-//    renormalize(X);
-//
-//    unsigned int j;
-//    for (j = 0; j < (framebits + excess) % 2; ++j) {
-//        int i;
-//        for (i = 0; i < 64 / 2; i++) {
-//            BFLY(i,
-//                 (((framebits + excess) >> 1) << 1) + j,
-//                 syms,
-//                 Y,
-//                 X,
-//                 (decision_t*)dec,
-//                 Branchtab);
-//        }
-//
-//        renormalize(Y);
-//    }
-//    /*skip*/
-//}
-//
-//#endif /*LV_HAVE_AVX2*/
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
+                                                 unsigned char* X,
+                                                 unsigned char* syms,
+                                                 unsigned char* dec,
+                                                 unsigned int framebits,
+                                                 unsigned int excess,
+                                                 unsigned char* Branchtab)
+{
+    unsigned int i9;
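+    // each pass of this loop consumes four input symbols and runs two Viterbi stages:
+    // X -> Y on the first symbol pair, then Y -> X on the second pair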
+    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+        unsigned char a75, a81;
+        int a73, a92;
+        int s20, s21;
+        unsigned char *a80, *b6;
+        int *a110, *a91, *a93;
+        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+        __m256i a86, a87;
+        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
+            s18, s19, s22, s23, s24, s25, t13, t14, t15;
+        a71 = ((__m256i*)X);
+        s18 = *(a71);
+        a72 = (a71 + 1);
+        s19 = *(a72);
+        a73 = (4 * i9);
+        b6 = (syms + a73);
+        a75 = *(b6);
+        a76 = _mm256_set1_epi8(a75);
+        a77 = ((__m256i*)Branchtab);
+        a78 = *(a77);
+        a79 = _mm256_xor_si256(a76, a78);
+        a80 = (b6 + 1);
+        a81 = *(a80);
+        a82 = _mm256_set1_epi8(a81);
+        a83 = (a77 + 1);
+        a84 = *(a83);
+        a85 = _mm256_xor_si256(a82, a84);
+        t13 = _mm256_avg_epu8(a79, a85);
+        a86 = ((__m256i)t13);
+        a87 = _mm256_srli_epi16(a86, 2);
+        a88 = ((__m256i)a87);
+        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+        m23 = _mm256_adds_epu8(s18, t14);
+        m24 = _mm256_adds_epu8(s19, t15);
+        m25 = _mm256_adds_epu8(s18, t15);
+        m26 = _mm256_adds_epu8(s19, t14);
+        a89 = _mm256_min_epu8(m24, m23);
+        d9 = _mm256_cmpeq_epi8(a89, m24);
+        a90 = _mm256_min_epu8(m26, m25);
+        d10 = _mm256_cmpeq_epi8(a90, m26);
+        s22 = _mm256_unpacklo_epi8(d9, d10);
+        s23 = _mm256_unpackhi_epi8(d9, d10);
+        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+        a91 = ((int*)dec);
+        a92 = (4 * i9);
+        a93 = (a91 + a92);
+        *(a93) = s20;
+        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+        a110 = (a93 + 1);
+        *(a110) = s21;
+        s22 = _mm256_unpacklo_epi8(a89, a90);
+        s23 = _mm256_unpackhi_epi8(a89, a90);
+        a95 = ((__m256i*)Y);
+        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+        *(a95) = s24;
+        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+        a112 = (a95 + 1);
+        *(a112) = s23;
+
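+        // renormalize the new metrics unconditionally: fold both 256-bit registers down
+        // to the global minimum (the 0x21 permute crosses the 128-bit lanes), broadcast
+        // that byte, and subtract it so the unsigned 8-bit metrics stay clear of saturation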
+        __m256i m5, m6;
+        m5 = ((__m256i*)Y)[0];
+        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
+        m5 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5));
+        __m256i m7;
+        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
+                                       ((__m256i)m7)));
+        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
+                                       ((__m256i)m7)));
+        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
+                                       ((__m256i)m7)));
+        m7 = _mm256_unpacklo_epi8(m7, m7);
+        m7 = _mm256_shufflelo_epi16(m7, 0);
+        m6 = _mm256_unpacklo_epi64(m7, m7);
+        m6 = _mm256_permute2x128_si256(
+            m6, m6, 0); // copy lower half of m6 to upper half, since above ops
+                        // operate on 128 bit lanes
+        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
+        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
+
+        unsigned char a188, a194;
+        int a205;
+        int s48, s54;
+        unsigned char *a187, *a193;
+        int *a204, *a206, *a223, *b16;
+        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+        __m256i a199, a200;
+        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
+            m41, m42, s46, s47, s50, s51, t25, t26, t27;
+        a184 = ((__m256i*)Y);
+        s46 = *(a184);
+        a185 = (a184 + 1);
+        s47 = *(a185);
+        a187 = (b6 + 2);
+        a188 = *(a187);
+        a189 = _mm256_set1_epi8(a188);
+        a190 = ((__m256i*)Branchtab);
+        a191 = *(a190);
+        a192 = _mm256_xor_si256(a189, a191);
+        a193 = (b6 + 3);
+        a194 = *(a193);
+        a195 = _mm256_set1_epi8(a194);
+        a196 = (a190 + 1);
+        a197 = *(a196);
+        a198 = _mm256_xor_si256(a195, a197);
+        t25 = _mm256_avg_epu8(a192, a198);
+        a199 = ((__m256i)t25);
+        a200 = _mm256_srli_epi16(a199, 2);
+        a201 = ((__m256i)a200);
+        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+        m39 = _mm256_adds_epu8(s46, t26);
+        m40 = _mm256_adds_epu8(s47, t27);
+        m41 = _mm256_adds_epu8(s46, t27);
+        m42 = _mm256_adds_epu8(s47, t26);
+        a202 = _mm256_min_epu8(m40, m39);
+        d17 = _mm256_cmpeq_epi8(a202, m40);
+        a203 = _mm256_min_epu8(m42, m41);
+        d18 = _mm256_cmpeq_epi8(a203, m42);
+        s24 = _mm256_unpacklo_epi8(d17, d18);
+        s25 = _mm256_unpackhi_epi8(d17, d18);
+        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+        a204 = ((int*)dec);
+        a205 = (4 * i9);
+        b16 = (a204 + a205);
+        a206 = (b16 + 2);
+        *(a206) = s48;
+        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+        a223 = (b16 + 3);
+        *(a223) = s54;
+        s50 = _mm256_unpacklo_epi8(a202, a203);
+        s51 = _mm256_unpackhi_epi8(a202, a203);
+        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+        a208 = ((__m256i*)X);
+        *(a208) = s25;
+        a225 = (a208 + 1);
+        *(a225) = s51;
+
+        __m256i m12, m13;
+        m12 = ((__m256i*)X)[0];
+        m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
+        m12 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m12, m12, 0x21), m12));
+        __m256i m14;
+        m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
+                                        ((__m256i)m14)));
+        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
+                                        ((__m256i)m14)));
+        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
+                                        ((__m256i)m14)));
+        m14 = _mm256_unpacklo_epi8(m14, m14);
+        m14 = _mm256_shufflelo_epi16(m14, 0);
+        m13 = _mm256_unpacklo_epi64(m14, m14);
+        m13 = _mm256_permute2x128_si256(m13, m13, 0);
+        ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
+        ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
+    }
+
+    renormalize(X);
+
+    unsigned int j;
+    for (j = 0; j < (framebits + excess) % 2; ++j) {
+        int i;
+        for (i = 0; i < 64 / 2; i++) {
+            BFLY(i,
+                 (((framebits + excess) >> 1) << 1) + j,
+                 syms,
+                 Y,
+                 X,
+                 (decision_t*)dec,
+                 Branchtab);
+        }
+
+        renormalize(Y);
+    }
+    /*skip*/
+}
+
+#endif /*LV_HAVE_AVX2*/
 
 
 #if LV_HAVE_SSE3