Index: SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512BW-") Index: SingleSource/UnitTests/Vector/AVX512BW/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512BW/Makefile +++ SingleSource/UnitTests/Vector/AVX512BW/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512BW/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512bw -I${SourceDir}/.. +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512bw +LCCFLAGS += -march=native -mavx512bw Index: SingleSource/UnitTests/Vector/AVX512BW/load_store.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BW/load_store.c +++ SingleSource/UnitTests/Vector/AVX512BW/load_store.c @@ -0,0 +1,267 @@ +/* + * Test load and store instructions. + * Here we check for _mm512_[mask|maskz]_[loadu|storeu] intrinsics. + */ +#include "m512_test_util.h" + +V512 src_vals[2]; +V512 all_ones; +volatile int vol0 = 0; + +void NOINLINE init() { + volatile int i; + int j; + + for (i = 0; i < sizeof(src_vals) / sizeof(src_vals[0]); i++) { + for (j = 0; j < 16; j++) { + src_vals[i].s32[j] = 16 * i + j; + } + } + + for (i = 0; i < 16; i++) { + all_ones.s32[i] = -1; + } +} + +void NOINLINE do_loadu() { + V512 res; + V512 expected; + __mmask64 k64 = 0xfbde79feffeeffee; + __mmask32 k32 = 0xbfde79fe; + __mmask16 k16 = 0xbfde; + __mmask8 k8 = 0xaf; + volatile int i; + signed char *p8 = &src_vals[0].s8[0]; + short *p16 = &src_vals[0].s16[0]; + int *p = &src_vals[0].s32[0]; + __int64 *p64 = &src_vals[0].s64[0]; + + res.zmm = _mm512_loadu_ps(&src_vals[0].s32[1]); + for (i = 0; i < 16; i++) { + expected.s32[i] = p[i + 1]; + } + check_equal_nd(&res, &expected, 16, "_mm512_loadu_ps", __LINE__); + + res.zmmd = _mm512_loadu_pd(&src_vals[0].s32[2]); + for (i = 0; i < 16; i++) { + expected.s32[i] = p[i + 2]; + } + check_equal_nd(&res, &expected, 16, "_mm512_loadu_pd", __LINE__); + + res.zmmi = _mm512_loadu_si512(&src_vals[0].s32[3]); + for (i = 0; i < 16; i++) { + expected.s32[i] = p[i + 3]; + } + check_equal_nd(&res, &expected, 16, "_mm512_loadu_si512", __LINE__); + + /* Now the write-masked versions. 
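With the _mm512_mask_loadu_* forms, lanes whose mask bit is 1 are loaded from memory + * and lanes whose bit is 0 keep the previous register contents; the _mm512_maskz_loadu_* + * forms zero the unselected lanes, which is what the expected-value loops below model.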
*/ + + res = all_ones; + expected = all_ones; + res.zmm = _mm512_mask_loadu_ps(res.zmm, k16, &src_vals[0].s32[5]); + for (i = 0; i < 16; i++) { + if ((1 << i) & k16) { + expected.s32[i] = p[i + 5]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_ps", __LINE__); + + k64 += vol0; + res = all_ones; + expected = all_ones; + res.zmmi = _mm512_mask_loadu_epi8(res.zmmi, k64, &src_vals[0].s8[7]); + for (i = 0; i < 64; i++) { + if (((__mmask64)1 << i) & k64) { + expected.s8[i] = p8[i + 7]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi8", __LINE__); + + k64 += vol0; + res = all_ones; + expected.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_maskz_loadu_epi8(k64, &src_vals[0].s8[9]); + for (i = 0; i < 64; i++) { + if (((__mmask64)1 << i) & k64) { + expected.s8[i] = p8[i + 9]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_loadu_epi8", __LINE__); + + k32 += vol0; + res = all_ones; + expected = all_ones; + res.zmmi = _mm512_mask_loadu_epi16(res.zmmi, k32, &src_vals[0].s16[5]); + for (i = 0; i < 32; i++) { + if ((1 << i) & k32) { + expected.s16[i] = p16[i + 5]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi16", __LINE__); + + k32 += vol0; + res = all_ones; + expected.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_maskz_loadu_epi16(k32, &src_vals[0].s16[3]); + for (i = 0; i < 32; i++) { + if ((1 << i) & k32) { + expected.s16[i] = p16[i + 3]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_loadu_epi16", __LINE__); + + k16 = 0xabcd + vol0; + res = all_ones; + expected = all_ones; + res.zmmi = _mm512_mask_loadu_epi32(res.zmmi, k16, &src_vals[0].s32[7]); + for (i = 0; i < 16; i++) { + if ((1 << i) & k16) { + expected.s32[i] = p[i + 7]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi32", __LINE__); + + res = all_ones; + expected = all_ones; + res.zmmd = _mm512_mask_loadu_pd(res.zmmd, k8, &src_vals[0].s64[2]); + for (i = 0; i < 8; i++) { + if ((1 << i) & k8) { + expected.s64[i] = p64[i + 2]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_pd", __LINE__); + + k8 = 0x79 + vol0; + res = all_ones; + expected = all_ones; + res.zmmi = _mm512_mask_loadu_epi64(res.zmmi, k8, &src_vals[0].s64[3]); + for (i = 0; i < 8; i++) { + if ((1 << i) & k8) { + expected.s64[i] = p64[i + 3]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi64", __LINE__); +} + +void NOINLINE do_storeu() { + V512 src; + V512 expected; + volatile int i; + static V512 dst_vals[2]; + __mmask64 k64 = 0xabcdffffffffeebd; + __mmask32 k32 = 0xfefebdbd; + __mmask16 k16 = 0x79ab; + __mmask8 k8 = 0xea; + + src.zmmi = src_vals[0].zmmi; + + dst_vals[0].zmm = _mm512_setzero_ps(); + dst_vals[1].zmm = _mm512_setzero_ps(); + _mm512_storeu_si512(&dst_vals[0].s32[1], src.zmmi); + check_equal_nd(&dst_vals[0].s32[1], &src_vals, 16, "_mm512_storeu_si512", + __LINE__); + + dst_vals[0].zmm = _mm512_setzero_ps(); + dst_vals[1].zmm = _mm512_setzero_ps(); + _mm512_storeu_ps(&dst_vals[0].s32[2], src.zmm); + check_equal_nd(&dst_vals[0].s32[2], &src_vals, 16, "_mm512_storeu_pd", + __LINE__); + + dst_vals[0].zmm = _mm512_setzero_ps(); + dst_vals[1].zmm = _mm512_setzero_ps(); + _mm512_storeu_pd(&dst_vals[0].s32[4], src.zmmd); + check_equal_nd(&dst_vals[0].s32[4], &src_vals, 16, "_mm512_storeu_pd", + __LINE__); + + /* Now the write-masked versions. 
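Only the lanes whose mask bit is 1 are written to memory by the masked stores; + * bytes belonging to unselected lanes are left untouched (still all_ones here).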
*/ + + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_epi8(&dst_vals[0].s8[3], k64, src.zmmi); + expected = all_ones; + for (i = 0; i < 64; i++) { + if (((__mmask64)1 << i) & k64) { + expected.s8[i] = src.s8[i]; + } + } + check_equal_nd(&dst_vals[0].s8[3], &expected, 16, "_mm512_mask_storeu_epi8", + __LINE__); + + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_epi16(&dst_vals[0].s16[3], k32, src.zmmi); + expected = all_ones; + for (i = 0; i < 32; i++) { + if (((__mmask32)1 << i) & k32) { + expected.s16[i] = src.s16[i]; + } + } + check_equal_nd(&dst_vals[0].s16[3], &expected, 16, "_mm512_mask_storeu_epi16", + __LINE__); + + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_epi32(&dst_vals[0].s32[1], k16, src.zmmi); + expected = all_ones; + for (i = 0; i < 16; i++) { + if ((1 << i) & k16) { + expected.s32[i] = src.s32[i]; + } + } + check_equal_nd(&dst_vals[0].s32[1], &expected, 16, "_mm512_mask_storeu_epi32", + __LINE__); + + k16 = 0xdcba + vol0; + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_ps(&dst_vals[0].s32[3], k16, src.zmm); + expected = all_ones; + for (i = 0; i < 16; i++) { + if ((1 << i) & k16) { + expected.s32[i] = src.s32[i]; + } + } + check_equal_nd(&dst_vals[0].s32[3], &expected, 16, "_mm512_mask_storeu_ps", + __LINE__); + + k8 = 0xbc; + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_pd(&dst_vals[0].s64[3], k8, src.zmmd); + expected = all_ones; + for (i = 0; i < 8; i++) { + if ((1 << i) & k8) { + expected.s64[i] = src.s64[i]; + } + } + check_equal_nd(&dst_vals[0].s64[3], &expected, 16, "_mm512_mask_storeu_pd", + __LINE__); + + k8 = 0xcb + vol0; + dst_vals[0] = all_ones; + dst_vals[1] = all_ones; + _mm512_mask_storeu_epi64(&dst_vals[0].s64[1], k8, src.zmmi); + expected = all_ones; + for (i = 0; i < 8; i++) { + if ((1 << i) & k8) { + expected.s64[i] = src.s64[i]; + } + } + check_equal_nd(&dst_vals[0].s64[1], &expected, 16, "_mm512_mask_storeu_epi64", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_loadu(); + do_storeu(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output +++ SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c @@ -0,0 +1,234 @@ +/* + * Exercise intrinsics for a instructions which set mask register + * by values in vector registers and set vector register value by + * values in mask register. 
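Covered here: _mm*_movepi8_mask / _mm*_movepi16_mask (vector to mask) and + * _mm*_movm_epi8 / _mm*_movm_epi16 (mask to vector) in their 128-, 256- and 512-bit forms.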
+ */ + +#include "m512_test_util.h" + +__int64 calc_expected_mask_val(const char *valp, int el_size, int length) { + __int64 rval = 0; + int i; + + for (i = 0; i < length; i++) { + if ((valp[el_size * i + (el_size - 1)] & 0x80) != 0) { + rval |= (1LL << i); + } + } + + return rval; +} + +char *calc_expected_vec_val(__mmask64 mask_val, int mask_size, int el_size, + char *buf) { + int i, j; + + for (i = 0; i < mask_size * el_size; buf[i++] = 0) + ; + + for (i = 0; i < mask_size; i++) { + if ((mask_val & (1LL << i)) != 0) { + for (j = 0; j < el_size; j++) { + buf[i * el_size + j] = 0xff; + } + } + } + + return buf; +} + +NOINLINE void check_mask16(__mmask16 res_mask, __mmask16 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 16; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + } +} + +NOINLINE void check_xmm_arr(const __m128i xvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&xvar; + + if (memcmp((void *)p, (void *)buf, 16) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 16; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 16; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%04x\n", (int)(input)&0xffff); + n_errs++; + } +} + +NOINLINE void test_xmm(int shift, int mulp) { + ALIGNTO(16) char buf[16]; + int i; + __m128i xvar; + + for (i = 0; i < 16; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&xvar, buf, 16); + + check_mask16(_mm_movepi8_mask(xvar), calc_expected_mask_val(buf, 1, 16), + "_mm_movepi8_mask", buf); + check_mask16(_mm_movepi16_mask(xvar), calc_expected_mask_val(buf, 2, 8), + "_mm_movepi16_mask", buf); + + check_xmm_arr(_mm_movm_epi8((__mmask16)shift * mulp), + calc_expected_vec_val(shift * mulp, 16, 1, buf), + "_mm_movm_epi8", (__mmask16)shift * mulp); + check_xmm_arr(_mm_movm_epi16((__mmask16)shift * mulp), + calc_expected_vec_val(shift * mulp, 8, 2, buf), + "_mm_movm_epi16", (__mmask16)shift * mulp); +} + +NOINLINE void check_mask32(__mmask32 res_mask, __mmask32 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 32; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + exit(1); + } +} + +NOINLINE void check_ymm_arr(const __m256i yvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&yvar; + + if (memcmp((void *)p, (void *)buf, 32) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 32; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 32; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%04x\n", (int)(input)); + n_errs++; + } +} + +NOINLINE void test_ymm(int shift, int mulp) { + ALIGNTO(32) char buf[32]; + int i; + __m256i yvar; + + for (i = 0; i < 32; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&yvar, buf, 32); + + check_mask32(_mm256_movepi8_mask(yvar), calc_expected_mask_val(buf, 1, 32), + "_mm256_movepi8_mask", buf); + check_mask32(_mm256_movepi16_mask(yvar), calc_expected_mask_val(buf, 2, 16), + "_mm256_movepi16_mask", buf); + + check_ymm_arr(_mm256_movm_epi8((__mmask32)shift * mulp), + calc_expected_vec_val(shift * mulp, 32, 1, buf), + "_mm256_movm_epi8", (__mmask32)shift * mulp); + check_ymm_arr(_mm256_movm_epi16((__mmask32)shift * mulp), + calc_expected_vec_val(shift * mulp, 16, 2, 
buf), + "_mm256_movm_epi16", (__mmask32)shift * mulp); +} + +NOINLINE void check_mask64(__mmask64 res_mask, __mmask64 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%llx != 0x%llx, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 64; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + } +} + +NOINLINE void check_zmm_arr(const __m512i zvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&zvar; + + if (memcmp((void *)p, (void *)buf, 64) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 64; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 64; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%08llx\n", input); + n_errs++; + } +} + +NOINLINE void test_zmm(int shift, int mulp) { + ALIGNTO(64) char buf[64]; + int i; + __m512i zvar; + + for (i = 0; i < 64; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&zvar, buf, 64); + + check_mask64(_mm512_movepi8_mask(zvar), calc_expected_mask_val(buf, 1, 64), + "_mm512_movepi8_mask", buf); + check_mask64(_mm512_movepi16_mask(zvar), calc_expected_mask_val(buf, 2, 32), + "_mm512_movepi16_mask", buf); + + check_zmm_arr(_mm512_movm_epi8((__mmask64)shift * mulp), + calc_expected_vec_val(shift * mulp, 64, 1, buf), + "_mm512_movm_epi8", (__mmask64)shift * mulp); + check_zmm_arr(_mm512_movm_epi16((__mmask64)shift * mulp), + calc_expected_vec_val(shift * mulp, 32, 2, buf), + "_mm512_movm_epi16", (__mmask64)shift * mulp); +} + +NOINLINE void test_all() { + int shift, mulp; + + for (mulp = -1000; mulp < 1000; mulp += 10) { + for (shift = 0; shift < 64; shift++) { + test_xmm(shift, mulp); + test_ymm(shift, mulp); + test_zmm(shift, mulp); + } + } +} + +int main(void) { + test_all(); + + if (n_errs != 0) { + printf("FAILED, n_errs = %d\n", n_errs); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c @@ -0,0 +1,208 @@ + +/* + * Test load, copy and store intrinsics related to integer move instructions. + */ + +#include "m512_test_util.h" + +V512 i8_src1; +V512 i8_src2; +V512 i16_src1; +V512 i16_src2; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8_src1.s8[i] = i; + i8_src2.s8[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 32; i++) { + i16_src1.s16[i] = i; + i16_src2.s16[i] = (i & 1) ? i : -i; + } +} + +/* + * Use "soft update" between tests to make compiler think src was updated. + * Prevents PRE'ing a load of src, thus allowing ciscization. + * Also prevents PRE'ing intrinsic operations, ensuring we + * execute the intended instructions. 
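The mechanism: vol0 is a volatile int that always holds 0, so the macro below is a + * run-time no-op, yet the compiler must assume the indexed lane may have changed + * and cannot reuse a previously loaded value of the variable.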
+ */ +volatile int vol0 = 0; +#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0] + +#define BLANK + +#define GEN_MASK_I8_LOAD(oper) GEN_MASK_I8(oper, &) +#define GEN_MASK_I8_COPY(oper) GEN_MASK_I8(oper, BLANK) + +#define GEN_MASK_I8(oper, addr_of) \ + void NOINLINE do_##oper() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask64 k64 = 0xabcdeffe97febdca; \ + __mmask32 k32 = (__mmask32)k64; \ + __mmask16 k16 = (__mmask16)k64; \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i8_src2); \ + zmm_res.zmmi = \ + _mm512_mask_##oper(i8_src1.zmmi, k64, addr_of i8_src2.zmmi); \ + soft_v512_update(i8_src2); \ + ymm_res.ymmi[0] = \ + _mm256_mask_##oper(i8_src1.ymmi[0], k32, addr_of i8_src2.ymmi[0]); \ + soft_v512_update(i8_src2); \ + xmm_res.xmmi[0] = \ + _mm_mask_##oper(i8_src1.xmmi[0], k16, addr_of i8_src2.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \ + \ + /* Zero-masked. */ \ + \ + zmm_res.zmmi = _mm512_set1_epi32(1.0); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i8_src1); \ + zmm_res.zmmi = _mm512_maskz_##oper(k64, addr_of i8_src1.zmmi); \ + soft_v512_update(i8_src1); \ + ymm_res.ymmi[0] = _mm256_maskz_##oper(k32, addr_of i8_src1.ymmi[0]); \ + soft_v512_update(i8_src1); \ + xmm_res.xmmi[0] = _mm_maskz_##oper(k16, addr_of i8_src1.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_maskz_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_maskz_" #oper, __LINE__); \ + } + +#define GEN_MASK_I8_STORE(oper) \ + void NOINLINE do_##oper() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask64 k64 = 0xabcdeffe97febdca; \ + __mmask32 k32 = (__mmask32)k64; \ + __mmask16 k16 = (__mmask16)k64; \ + \ + /* Masked. */ \ + \ + zmm_res = i16_src1; \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i8_src2); \ + _mm512_mask_##oper(&zmm_res.zmmi, k64, i8_src2.zmmi); \ + soft_v512_update(i8_src2); \ + soft_v512_update(ymm_res); \ + _mm256_mask_##oper(&ymm_res.ymmi[0], k32, i8_src2.ymmi[0]); \ + soft_v512_update(i8_src2); \ + soft_v512_update(xmm_res); \ + _mm_mask_##oper(&xmm_res.xmmi[0], k16, i8_src2.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \ + } + +#define GEN_MASK_I16_LOAD(oper) GEN_MASK_I16(oper, &) +#define GEN_MASK_I16_COPY(oper) GEN_MASK_I16(oper, BLANK) + +#define GEN_MASK_I16(oper, addr_of) \ + void NOINLINE do_##oper() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask32 k32 = 0xcfe97dba; \ + __mmask16 k16 = (__mmask16)k32; \ + __mmask8 k8 = (__mmask8)k32; \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i16_src2); \ + zmm_res.zmmi = \ + _mm512_mask_##oper(i16_src1.zmmi, k32, addr_of i16_src2.zmmi); \ + soft_v512_update(i16_src2); \ + ymm_res.ymmi[0] = \ + _mm256_mask_##oper(i16_src1.ymmi[0], k16, addr_of i16_src2.ymmi[0]); \ + soft_v512_update(i16_src2); \ + xmm_res.xmmi[0] = \ + _mm_mask_##oper(i16_src1.xmmi[0], k8, addr_of i16_src2.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \ + \ + /* Zero-masked. 
*/ \ + \ + zmm_res.zmmi = _mm512_set1_epi32(1.0); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i16_src1); \ + zmm_res.zmmi = _mm512_maskz_##oper(k32, addr_of i16_src1.zmmi); \ + soft_v512_update(i16_src1); \ + ymm_res.ymmi[0] = _mm256_maskz_##oper(k16, addr_of i16_src1.ymmi[0]); \ + soft_v512_update(i16_src1); \ + xmm_res.xmmi[0] = _mm_maskz_##oper(k8, addr_of i16_src1.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_maskz_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_maskz_" #oper, __LINE__); \ + } + +#define GEN_MASK_I16_STORE(oper) \ + void NOINLINE do_##oper() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask32 k32 = 0xcfe97dba; \ + __mmask16 k16 = (__mmask16)k32; \ + __mmask8 k8 = (__mmask8)k32; \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i16_src2); \ + _mm512_mask_##oper(&zmm_res.zmmi, k32, i16_src2.zmmi); \ + soft_v512_update(i16_src2); \ + soft_v512_update(ymm_res); \ + _mm256_mask_##oper(&ymm_res.ymmi[0], k16, i16_src2.ymmi[0]); \ + soft_v512_update(i16_src2); \ + soft_v512_update(xmm_res); \ + _mm_mask_##oper(&xmm_res.xmmi[0], k8, i16_src2.xmmi[0]); \ + \ + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \ + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \ + } + +GEN_MASK_I8_LOAD(loadu_epi8) +GEN_MASK_I8_COPY(mov_epi8) +GEN_MASK_I8_STORE(storeu_epi8) + +GEN_MASK_I16_LOAD(loadu_epi16) +GEN_MASK_I16_COPY(mov_epi16) +GEN_MASK_I16_STORE(storeu_epi16) + +int main() { + init(); + + do_loadu_epi8(); + do_mov_epi8(); + do_storeu_epi8(); + + do_loadu_epi16(); + do_mov_epi16(); + do_storeu_epi16(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/sets.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/sets.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/sets.c @@ -0,0 +1,316 @@ +/* + * Test the "set" intrinsics. + * + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mask_set1_epi* + * _mm_maskz_set1_epi* + * _mm256_mask_set1_epi* + * _mm256_maskz_set1_epi* + * _mm512_mask_set1_epi* + * _mm512_maskz_set1_epi* + * _mm512_set1_epi* + */ + +#include "m512_test_util.h" + +volatile int vol0 = 0; + +/* Some scalars that can be ciscized. */ + +unsigned char char6 = 6; +unsigned short short7 = 7; +int int11 = 11; +__int64 int64_13 = 13; + +void NOINLINE invalidate_scalars() { + /* Make compiler think these variables could have an arbitrary value. 
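vol0 is volatile and always 0, so the additions below leave the values unchanged + * at run time while preventing the compiler from treating char6, short7, int11 and + * int64_13 as known constants.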
*/ + char6 += vol0; + short7 += vol0; + int11 += vol0; + int64_13 += vol0; +} + +void NOINLINE do_set1_epi8() { + V512 res, xres, yres; + V512 expected; + __mmask64 k = 0xffeebb97abcdffe9; + __mmask32 k32 = (__mmask32)k; + __mmask16 k16 = (__mmask16)k32; + int i; + + res.zmmi = _mm512_set1_epi8(9); + expected.zmmi = _mm512_set1_epi32(0x09090909); + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi8", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi8(char6); + expected.zmmi = _mm512_set1_epi32(0x06060606); + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi8 unknown", __LINE__); + + invalidate_scalars(); + + /* Masked */ + res.zmmi = _mm512_set1_epi32(-1); + + xres.xmmi[0] = _mm_mask_set1_epi8(res.xmmi[0], k16, 14); + yres.ymmi[0] = _mm256_mask_set1_epi8(res.ymmi[0], k32, 14); + res.zmmi = _mm512_mask_set1_epi8(res.zmmi, k, 14); + + expected.zmmi = _mm512_set1_epi32(0x0e0e0e0e); + for (i = 0; i < 64; i++) { + if ((k & ((__mmask64)1 << i)) == 0) { + expected.s8[i] = -1; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi8", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi8", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi8", __LINE__); + + invalidate_scalars(); + /* Zero masked */ + + res.zmmi = _mm512_maskz_set1_epi8(k, 19); + xres.xmmi[0] = _mm_maskz_set1_epi8(k16, 19); + yres.ymmi[0] = _mm256_maskz_set1_epi8(k32, 19); + + expected.zmmi = _mm512_set1_epi32(0x13131313); + for (i = 0; i < 64; i++) { + if ((k & ((__mmask64)1 << i)) == 0) { + expected.s8[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi8", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi8", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi8", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi8(k, char6); + expected.zmmi = _mm512_set1_epi32(0x06060606); + for (i = 0; i < 64; i++) { + if ((k & ((__mmask64)1 << i)) == 0) { + expected.s8[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi8 unknown", + __LINE__); +} + +void NOINLINE do_set1_epi16() { + V512 res, xres, yres; + V512 expected; + __mmask32 k = 0xabcdffe9; + __mmask16 k16 = (__mmask16)k; + __mmask8 k8 = (__mmask8)k16; + int i; + + res.zmmi = _mm512_set1_epi16(9); + expected.zmmi = _mm512_set1_epi32((9 << 16) | 9); + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi16", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi16(short7); + expected.zmmi = _mm512_set1_epi32((7 << 16) | 7); + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi16 unknown", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi32(-1); + + xres.xmmi[0] = _mm_mask_set1_epi16(res.xmmi[0], k8, 14); + yres.ymmi[0] = _mm256_mask_set1_epi16(res.ymmi[0], k16, 14); + res.zmmi = _mm512_mask_set1_epi16(res.zmmi, k, 14); + + expected.zmmi = _mm512_set1_epi32((14 << 16) | 14); + for (i = 0; i < 32; i++) { + if ((k & (1 << i)) == 0) { + expected.s16[i] = -1; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi16", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi16", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi16", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi16(k, 19); + xres.xmmi[0] = _mm_maskz_set1_epi16(k8, 19); + yres.ymmi[0] = _mm256_maskz_set1_epi16(k16, 19); + + expected.zmmi = _mm512_set1_epi32((19 << 16) | 19); + for (i = 0; i < 32; i++) { + if ((k & (1 << i)) == 0) { + expected.s16[i] = 
0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi16", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi16", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi16", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi16(k, short7); + expected.zmmi = _mm512_set1_epi32((7 << 16) | 7); + for (i = 0; i < 32; i++) { + if ((k & (1 << i)) == 0) { + expected.s16[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi16 unknown", + __LINE__); +} + +void NOINLINE do_set1_epi32() { + V512 res, xres, yres; + V512 expected; + __mmask16 k = 0xf7e6; + __mmask8 k8 = (__mmask8)k; + volatile int i; + + res.zmmi = _mm512_set1_epi32(9); + for (i = 0; i < 16; i++) { + expected.s32[i] = 9; + } + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi32", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi32(int11); + for (i = 0; i < 16; i++) { + expected.s32[i] = 11; + } + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi32 unknown", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi32(-1); + xres.xmmi[0] = _mm_mask_set1_epi32(res.xmmi[0], k8, 14); + yres.ymmi[0] = _mm256_mask_set1_epi32(res.ymmi[0], k8, 14); + res.zmmi = _mm512_mask_set1_epi32(res.zmmi, k, 14); + + for (i = 0; i < 16; i++) { + if ((k & (1 << i)) == 0) { + expected.s32[i] = -1; + } else { + expected.s32[i] = 14; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi32", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi32", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi32", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi32(k, 19); + xres.xmmi[0] = _mm_maskz_set1_epi32(k8, 19); + yres.ymmi[0] = _mm256_maskz_set1_epi32(k8, 19); + + for (i = 0; i < 16; i++) { + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } else { + expected.s32[i] = 19; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi32", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi32", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi32", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi32(k, int11); + for (i = 0; i < 16; i++) { + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } else { + expected.s32[i] = 11; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi32 unknown", + __LINE__); +} + +void NOINLINE do_set1_epi64() { + V512 res, xres, yres; + V512 expected; + __mmask8 k = 0xe7; + volatile int i; + + res.zmmi = _mm512_set1_epi64(9); + for (i = 0; i < 8; i++) { + expected.s64[i] = 9; + } + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi64", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi64(int64_13); + for (i = 0; i < 8; i++) { + expected.s64[i] = 13; + } + check_equal_nd(&res, &expected, 16, "_mm512_set1_epi64 unknown", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_set1_epi64(-1); + xres.xmmi[0] = _mm_mask_set1_epi64(res.xmmi[0], k, 14); + yres.ymmi[0] = _mm256_mask_set1_epi64(res.ymmi[0], k, 14); + res.zmmi = _mm512_mask_set1_epi64(res.zmmi, k, 14); + for (i = 0; i < 8; i++) { + if ((k & (1 << i)) == 0) { + expected.s64[i] = -1; + } else { + expected.s64[i] = 14; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi64", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi64", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi64", __LINE__); + + invalidate_scalars(); + + 
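/* Zero masked */ +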
res.zmmi = _mm512_maskz_set1_epi64(k, 19); + xres.xmmi[0] = _mm_maskz_set1_epi64(k, 19); + yres.ymmi[0] = _mm256_maskz_set1_epi64(k, 19); + for (i = 0; i < 8; i++) { + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } else { + expected.s64[i] = 19; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi64", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi64", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi64", __LINE__); + + invalidate_scalars(); + + res.zmmi = _mm512_maskz_set1_epi64(k, int64_13); + for (i = 0; i < 8; i++) { + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } else { + expected.s64[i] = 13; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi64 unknown", + __LINE__); +} + +int main(int argc, char *argv[]) { + do_set1_epi8(); + do_set1_epi16(); + do_set1_epi32(); + do_set1_epi64(); + + if (n_errs) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c @@ -0,0 +1,1253 @@ +/* + * Test intrinsics related to integer down-converting instructions + * like vpmovdb, where the source values are in an __m128i value. + * + * This test was created to check the correctness + * of the following AVX512 intrinsics support: + *_mm_cvtepi* + *_mm_cvtsepi* + *_mm_cvtusepi* + *_mm_mask_cvtepi* + *_mm_mask_cvtsepi* + *_mm_mask_cvtusepi* + *_mm_maskz_cvtepi* + *_mm_maskz_cvtsepi* + *_mm_maskz_cvtusepi* + */ + +#include "m512_test_util.h" +#include + +volatile int vol0 = 0; + +#define soft_src_update(var) var.xmmi[vol0] = var.xmmi[vol0] + +V512 i8; +V512 i16; +V512 i16_mix; +V512 i16_big; +V512 i32; +V512 i32_mix; +V512 i32_big; +V512 i64; +V512 i64_mix; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = i; + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = i; + i16_mix.s16[i] = (i & 1) ? i : -i; + i16_big.s16[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i16_big.s16[i] = -i16_big.s16[i]; + } + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_mix.s32[i] = (i & 1) ? i : -i; + i32_big.s32[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i32_big.s32[i] = -i32_big.s32[i]; + } + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? i : -i; + } +} + +/* Saturation utility functions for emulation. */ + +static signed char NOINLINE sat16_8(short s) { + return (s < -128) ? -128 : ((s > 127) ? 127 : s); +} + +static unsigned char NOINLINE usat16_8(unsigned short s) { + return (s > 255) ? 255 : s; +} + +static signed char NOINLINE sat32_8(int s) { + return (s < -128) ? -128 : ((s > 127) ? 127 : s); +} + +static unsigned char usat32_8(unsigned int s) { return (s > 255) ? 255 : s; } + +static short NOINLINE sat32_16(int s) { + return (s < (int)0xffff8000) ? (int)0xffff8000 + : ((s > (int)0x7fff) ? (int)0x7fff : s); +} + +static unsigned short NOINLINE usat32_16(unsigned int s) { + return (s > (unsigned int)0xffff) ? 
(unsigned int)0xffff : s; +} + +static signed char NOINLINE sat64_8(__int64 s) { + return (s < -128) ? -128 : ((s > 127) ? 127 : s); +} + +static unsigned char NOINLINE usat64_8(unsigned __int64 s) { + return (s > 255) ? 255 : s; +} + +static short NOINLINE sat64_16(__int64 s) { + return (s < SHRT_MIN) ? SHRT_MIN : ((s > SHRT_MAX) ? SHRT_MAX : s); +} + +static unsigned short NOINLINE usat64_16(unsigned __int64 s) { + return (s > USHRT_MAX) ? USHRT_MAX : s; +} + +static int NOINLINE sat64_32(__int64 s) { + return (s < INT_MIN) ? INT_MIN : ((s > INT_MAX) ? INT_MAX : s); +} + +static unsigned int NOINLINE usat64_32(unsigned __int64 s) { + return (s > UINT_MAX) ? UINT_MAX : s; +} + +void NOINLINE do_pmovwb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; + + res.xmmi[0] = _mm_cvtepi16_epi8(i16.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + expected.s8[i] = i16.s16[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi16_epi8", __LINE__); + + soft_src_update(i16); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi16_epi8(res.xmmi[1], k, i16.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i16.s16[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi16_epi8", __LINE__); + + soft_src_update(i16); + res.xmmi[0] = _mm_maskz_cvtepi16_epi8(k, i16.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i16.s16[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi16_epi8", __LINE__); + + soft_src_update(i16); + res.xmmi[vol0] = i16_big.xmmi[0]; + _mm_mask_cvtepi16_storeu_epi8(&res.xmmi[0], k, i16.xmmi[0]); + expected.xmmi[0] = i16_big.xmmi[0]; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i16.s16[i]; + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi16_storeu_epi8", __LINE__); +} + +void NOINLINE do_pmovswb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdc; + + res.xmmi[0] = _mm_cvtsepi16_epi8(i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + expected.s8[i] = sat16_8(i16_mix.s16[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi16_epi8(res.xmmi[1], k, i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat16_8(i16_mix.s16[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[0] = _mm_maskz_cvtsepi16_epi8(k, i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat16_8(i16_mix.s16[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[vol0] = i16_big.xmmi[0]; + _mm_mask_cvtsepi16_storeu_epi8(&res.xmmi[0], k, i16_mix.xmmi[0]); + expected.xmmi[0] = i16_big.xmmi[0]; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat16_8(i16_mix.s16[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi16_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovuswb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xfd; + + res.xmmi[0] = _mm_cvtusepi16_epi8(i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + expected.s8[i] = usat16_8(i16_mix.u16[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi16_epi8(res.xmmi[1], k, i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat16_8(i16_mix.u16[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[0] = _mm_maskz_cvtusepi16_epi8(k, i16_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat16_8(i16_mix.u16[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi16_epi8", __LINE__); + + soft_src_update(i16_mix); + res.xmmi[vol0] = i16_big.xmmi[0]; + _mm_mask_cvtusepi16_storeu_epi8(&res.xmmi[0], k, i16_mix.xmmi[0]); + expected.xmmi[0] = i16_big.xmmi[0]; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat16_8(i16_mix.u16[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi16_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovdb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xab; + + res.xmmi[0] = _mm_cvtepi32_epi8(i32.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + expected.s8[i] = i32.s32[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi32_epi8(res.xmmi[1], k, i32.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[0] = _mm_maskz_cvtepi32_epi8(k, i32.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[vol0] = i32_big.xmmi[0]; + _mm_mask_cvtepi32_storeu_epi8(&res.xmmi[0], k, i32.xmmi[0]); + expected.xmmi[0] = i32_big.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi32_storeu_epi8", __LINE__); +} + +void NOINLINE do_pmovsdb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; + + res.xmmi[0] = _mm_cvtsepi32_epi8(i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + expected.s8[i] = sat32_8(i32_mix.s32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi32_epi8(res.xmmi[1], k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat32_8(i32_mix.s32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm_maskz_cvtsepi32_epi8(k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat32_8(i32_mix.s32[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[vol0] = i32_big.xmmi[0]; + _mm_mask_cvtsepi32_storeu_epi8(&res.xmmi[0], k, i32_mix.xmmi[0]); + expected.xmmi[0] = i32_big.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat32_8(i32_mix.s32[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi32_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovusdb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xfd; + + res.xmmi[0] = _mm_cvtusepi32_epi8(i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + expected.s8[i] = usat32_8(i32_mix.u32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi32_epi8(res.xmmi[1], k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat32_8(i32_mix.u32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm_maskz_cvtusepi32_epi8(k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat32_8(i32_mix.u32[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[vol0] = i32_big.xmmi[0]; + _mm_mask_cvtusepi32_storeu_epi8(&res.xmmi[0], k, i32_mix.xmmi[0]); + expected.xmmi[0] = i32_big.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat32_8(i32_mix.u32[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi32_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovdw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcd; + + res.xmmi[0] = _mm_cvtepi32_epi16(i32.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + expected.s16[i] = i32.s32[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i8.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi32_epi16(res.xmmi[1], k, i32.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.xmmi[0] = _mm_maskz_cvtepi32_epi16(k, i32.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtepi32_storeu_epi16(&res.xmmi[0], k, i32.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovsdw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xfe; + + res.xmmi[0] = _mm_cvtsepi32_epi16(i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + expected.s16[i] = sat32_16(i32_mix.s32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi32_epi16(res.xmmi[1], k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat32_16(i32_mix.s32[i]); + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm_maskz_cvtsepi32_epi16(k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat32_16(i32_mix.s32[i]); + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtsepi32_storeu_epi16(&res.xmmi[0], k, i32_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat32_16(i32_mix.s32[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovusdw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xfe; + + res.xmmi[0] = _mm_cvtusepi32_epi16(i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + expected.u16[i] = usat32_16(i32_mix.u32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi32_epi16(res.xmmi[1], k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.u16[i] = usat32_16(i32_mix.u32[i]); + } else { + expected.u16[i] = res.u16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm_maskz_cvtusepi32_epi16(k, i32_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.u16[i] = usat32_16(i32_mix.u32[i]); + } else { + expected.u16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtusepi32_storeu_epi16(&res.xmmi[0], k, i32_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 4; i++) { + if ((1 << i) & k) { + expected.s16[i] = usat32_16(i32_mix.s32[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x76; + + res.xmmi[0] = _mm_cvtepi64_epi8(i64.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.s8[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi8", __LINE__); + + /* + * Exercise ciscization. + */ + + _mm_store_sd(&res.f64[2], _mm_castsi128_pd(_mm_cvtepi64_epi8(i64.xmmi[0]))); + check_equal_nd(&res.f64[2], &expected, 2, "_mm_cvtepi64_epi8 ciscized", + __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi64_epi8(res.xmmi[1], k, i64.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi8", __LINE__); + + soft_src_update(i64); + res.xmmi[0] = _mm_maskz_cvtepi64_epi8(k, i64.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi64_epi8", __LINE__); + + soft_src_update(i64); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtepi64_storeu_epi8(&res.xmmi[0], k, i64.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi64_storeu_epi8", __LINE__); +} + +void NOINLINE do_pmovsqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x67; + + res.xmmi[0] = _mm_cvtsepi64_epi8(i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.s8[i] = sat64_8(i64_mix.s64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi64_epi8(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat64_8(i64_mix.s64[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtsepi64_epi8(k, i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat64_8(i64_mix.s64[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtsepi64_storeu_epi8(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = sat64_8(i64_mix.s64[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi64_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovusqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x67; + + res.xmmi[0] = _mm_cvtusepi64_epi8(i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.u8[i] = (i64_mix.u64[i] > 255) ? 255 : i64_mix.u64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi64_epi8(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat64_8(i64_mix.u64[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtusepi64_epi8(k, i64_mix.xmmi[0]); + + expected.u64[0] = 0; + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat64_8(i64_mix.u64[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtusepi64_storeu_epi8(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s8[i] = usat64_8(i64_mix.u64[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi64_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + + res.xmmi[0] = _mm_cvtepi64_epi16(i64.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + expected.s16[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi64_epi16(res.xmmi[1], k, i64.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[0] = _mm_maskz_cvtepi64_epi16(k, i64.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtepi64_storeu_epi16(&res.xmmi[0], k, i64.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovsqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + + res.xmmi[0] = _mm_cvtsepi64_epi16(i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + expected.s16[i] = sat64_16(i64_mix.s64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi64_epi16(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat64_16(i64_mix.s64[i]); + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtsepi64_epi16(k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat64_16(i64_mix.s64[i]); + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtsepi64_storeu_epi16(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = sat64_16(i64_mix.s64[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovusqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + unsigned __int64 r; + + res.xmmi[0] = _mm_cvtusepi64_epi16(i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + expected.s16[i] = usat64_16(i64_mix.u64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi64_epi16(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = usat64_16(i64_mix.u64[i]); + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtusepi64_epi16(k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + expected.u64[0] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = usat64_16(i64_mix.u64[i]); + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtusepi64_storeu_epi16(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s16[i] = usat64_16(i64_mix.u64[i]); + } + } + + check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + + res.xmmi[0] = _mm_cvtepi64_epi32(i64.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.s32[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i8.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtepi64_epi32(res.xmmi[1], k, i64.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } else { + expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.xmmi[0] = _mm_maskz_cvtepi64_epi32(k, i64.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtepi64_storeu_epi32(&res.xmmi[0], k, i64.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi64_storeu_epi32", + __LINE__); +} + +void NOINLINE do_pmovsqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + + res.xmmi[0] = _mm_cvtsepi64_epi32(i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.s32[i] = sat64_32(i64_mix.s64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i8.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtsepi64_epi32(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = sat64_32(i64_mix.s64[i]); + } else { + expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtsepi64_epi32(k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = sat64_32(i64_mix.s64[i]); + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtsepi64_storeu_epi32(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = sat64_32(i64_mix.s64[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi64_storeu_epi32", + __LINE__); +} + +void NOINLINE do_pmovusqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + + res.xmmi[0] = _mm_cvtusepi64_epi32(i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + expected.u32[i] = usat64_32(i64_mix.u64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i8.xmmi[1]; + res.xmmi[0] = _mm_mask_cvtusepi64_epi32(res.xmmi[1], k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.u32[i] = usat64_32(i64_mix.u64[i]); + } else { + expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm_maskz_cvtusepi64_epi32(k, i64_mix.xmmi[0]); + + expected.u64[1] = 0; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.u32[i] = usat64_32(i64_mix.u64[i]); + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[vol0] = i8.xmmi[0]; + _mm_mask_cvtusepi64_storeu_epi32(&res.xmmi[0], k, i64_mix.xmmi[0]); + expected.xmmi[0] = i8.xmmi[0]; + for (i = 0; i < 2; i++) { + if ((1 << i) & k) { + expected.s32[i] = usat64_32(i64_mix.u64[i]); + } + } + + check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi64_storeu_epi32", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_pmovwb(); + do_pmovswb(); + do_pmovuswb(); + + do_pmovdb(); + do_pmovsdb(); + do_pmovusdb(); + + do_pmovdw(); + do_pmovsdw(); + do_pmovusdw(); + + do_pmovqb(); + do_pmovsqb(); + do_pmovusqb(); + + do_pmovqw(); + do_pmovsqw(); + do_pmovusqw(); + + do_pmovqd(); + do_pmovsqd(); + do_pmovusqd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512DQVL-") Index: SingleSource/UnitTests/Vector/AVX512DQVL/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQVL/Makefile +++ SingleSource/UnitTests/Vector/AVX512DQVL/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512DQVL/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512dq -mavx512vl -I${SourceDir}/.. +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512dq -mavx512vl +LCCFLAGS += -march=native -mavx512dq -mavx512vl Index: SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c +++ SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c @@ -0,0 +1,234 @@ +/* + * Exercise intrinsics for a instructions which set mask register + * by values in vector registers and set vector register value by + * values in mask register. 
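+ *
+ * For reference (an illustrative sketch only, mirroring
+ * calc_expected_vec_val/calc_expected_mask_val below): the vector-to-mask
+ * direction collects the sign bit of each element,
+ *   for (i = 0; i < N; i++)
+ *     if (src[el_size * i + (el_size - 1)] & 0x80) mask |= 1LL << i;
+ * and the mask-to-vector (movm) direction broadcasts mask bit i to every
+ * bit of element i.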
+ */ + +#include "m512_test_util.h" + +__int64 calc_expected_mask_val(const char *valp, int el_size, int length) { + __int64 rval = 0; + int i; + + for (i = 0; i < length; i++) { + if ((valp[el_size * i + (el_size - 1)] & 0x80) != 0) { + rval |= (1LL << i); + } + } + + return rval; +} + +char *calc_expected_vec_val(__mmask64 mask_val, int mask_size, int el_size, + char *buf) { + int i, j; + + for (i = 0; i < mask_size * el_size; buf[i++] = 0) + ; + + for (i = 0; i < mask_size; i++) { + if ((mask_val & (1LL << i)) != 0) { + for (j = 0; j < el_size; j++) { + buf[i * el_size + j] = 0xff; + } + } + } + + return buf; +} + +NOINLINE void check_mask16(__mmask16 res_mask, __mmask16 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 16; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + } +} + +NOINLINE void check_xmm_arr(const __m128i xvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&xvar; + + if (memcmp((void *)p, (void *)buf, 16) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 16; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 16; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%04x\n", (int)(input)&0xffff); + n_errs++; + } +} + +NOINLINE void test_xmm(int shift, int mulp) { + ALIGNTO(16) char buf[16]; + int i; + __m128i xvar; + + for (i = 0; i < 16; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&xvar, buf, 16); + + check_mask16(_mm_movepi32_mask(xvar), calc_expected_mask_val(buf, 4, 4), + "_mm_movepi32_mask", buf); + check_mask16(_mm_movepi64_mask(xvar), calc_expected_mask_val(buf, 8, 2), + "_mm_movepi64_mask", buf); + + check_xmm_arr(_mm_movm_epi32((__mmask16)shift * mulp), + calc_expected_vec_val(shift * mulp, 4, 4, buf), + "_mm_movm_epi32", (__mmask16)shift * mulp); + check_xmm_arr(_mm_movm_epi64((__mmask16)shift * mulp), + calc_expected_vec_val(shift * mulp, 2, 8, buf), + "_mm_movm_epi64", (__mmask16)shift * mulp); +} + +NOINLINE void check_mask32(__mmask32 res_mask, __mmask32 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 32; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + exit(1); + } +} + +NOINLINE void check_ymm_arr(const __m256i yvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&yvar; + + if (memcmp((void *)p, (void *)buf, 32) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 32; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 32; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%04x\n", (int)(input)); + n_errs++; + } +} + +NOINLINE void test_ymm(int shift, int mulp) { + ALIGNTO(32) char buf[32]; + int i; + __m256i yvar; + + for (i = 0; i < 32; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&yvar, buf, 32); + + check_mask32(_mm256_movepi32_mask(yvar), calc_expected_mask_val(buf, 4, 8), + "_mm256_movepi32_mask", buf); + check_mask32(_mm256_movepi64_mask(yvar), calc_expected_mask_val(buf, 8, 4), + "_mm256_movepi64_mask", buf); + + check_ymm_arr(_mm256_movm_epi32((__mmask32)shift * mulp), + calc_expected_vec_val(shift * mulp, 8, 4, buf), + "_mm256_movm_epi32", (__mmask32)shift * mulp); + check_ymm_arr(_mm256_movm_epi64((__mmask32)shift * mulp), + calc_expected_vec_val(shift * mulp, 4, 
8, buf), + "_mm256_movm_epi64", (__mmask32)shift * mulp); +} + +NOINLINE void check_mask64(__mmask64 res_mask, __mmask64 exp_mask, + const char *fname, const char *input) { + int i; + + if (res_mask != exp_mask) { + printf("%s: 0x%llx != 0x%llx, input = ", fname, res_mask, exp_mask); + for (i = 0; i < 64; i++) { + printf("%02x ", input[i] & 0xff); + } + printf("\n"); + n_errs++; + } +} + +NOINLINE void check_zmm_arr(const __m512i zvar, char *buf, const char *fname, + __mmask64 input) { + int i; + char *p = (char *)&zvar; + + if (memcmp((void *)p, (void *)buf, 64) != 0) { + printf("%s: 0x", fname); + for (i = 0; i < 64; i++) { + printf(" %02x", p[i] & 0xff); + } + printf(" != 0x"); + for (i = 0; i < 64; i++) { + printf(" %02x", buf[i] & 0xff); + } + printf(", input = 0x%08llx\n", input); + n_errs++; + } +} + +NOINLINE void test_zmm(int shift, int mulp) { + ALIGNTO(64) char buf[64]; + int i; + __m512i zvar; + + for (i = 0; i < 64; i++) { + buf[i] = (i << shift) * mulp; + } + + memcpy(&zvar, buf, 64); + + check_mask64(_mm512_movepi32_mask(zvar), calc_expected_mask_val(buf, 4, 16), + "_mm512_movepi32_mask", buf); + check_mask64(_mm512_movepi64_mask(zvar), calc_expected_mask_val(buf, 8, 8), + "_mm512_movepi64_mask", buf); + + check_zmm_arr(_mm512_movm_epi32((__mmask64)shift * mulp), + calc_expected_vec_val(shift * mulp, 16, 4, buf), + "_mm512_movm_epi32", (__mmask64)shift * mulp); + check_zmm_arr(_mm512_movm_epi64((__mmask64)shift * mulp), + calc_expected_vec_val(shift * mulp, 8, 8, buf), + "_mm512_movm_epi64", (__mmask64)shift * mulp); +} + +NOINLINE void test_all() { + int shift, mulp; + + for (mulp = -1000; mulp < 1000; mulp += 10) { + for (shift = 0; shift < 64; shift++) { + test_xmm(shift, mulp); + test_ymm(shift, mulp); + test_zmm(shift, mulp); + } + } +} + +int main(void) { + test_all(); + + if (n_errs != 0) { + printf("FAILED, n_errs = %d\n", n_errs); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output +++ SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c +++ SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c @@ -0,0 +1,147 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_i32gather_epi64() + * _mm512_mask_i32gather_epi64() + * _mm512_i32gather_pd() + * _mm512_mask_i32gather_pd() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst512_f[NUM]; +double dst_f[NUM]; +__int64 dst512_i[NUM]; +__int64 dst_i[NUM]; +double src_f[NUM]; +__int64 src_i[NUM]; +__int64 mask512[NUM / 8]; +int g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? 
(x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM - 1); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst512_i[i] = -i; + dst512_f[i] = -i; + + if (i % 8 == 0) { + mask512[i / 8] = (i * 31) & 0xff; + } + } +} + +void do_mm512_mmask_i32gather_epi64() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512i old_dst = _mm512_loadu_si512((const __m512i *)(dst_i + i)); + __m512i gtr = + _mm512_mask_i32gather_epi64(old_dst, mask512[i / 8], ind, src_i, SCALE); + _mm512_storeu_si512((__m512i *)(dst512_i + i), gtr); + } +} + +void do_mm512_mmask_i32gather_pd() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512d old_dst = _mm512_loadu_pd(dst_f + i); + __m512d gtr = + _mm512_mask_i32gather_pd(old_dst, mask512[i / 8], ind, src_f, SCALE); + _mm512_storeu_pd(dst512_f + i, gtr); + } +} + +void do_mm512_i32gather_epi64() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512i gtr = _mm512_i32gather_epi64(ind, src_i, SCALE); + _mm512_storeu_si512((__m512i *)(dst512_i + i), gtr); + } +} + +void do_mm512_i32gather_pd() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512d gtr = _mm512_i32gather_pd(ind, src_f, SCALE); + _mm512_storeu_pd(dst512_f + i, gtr); + } +} + +int checkm(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask, + __int64 *src, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + __int64 kmask = mask[i / elems_in_vector]; + __int64 kmask_bit = kmask & (1 << (i % elems_in_vector)); + + __int64 v = kmask_bit ? 
src[g_index[i]] : pass_thru_vals[i]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %I64d, actual %I64d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int check(int id, __int64 *res_dst, __int64 *src, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + + __int64 v = src[g_index[i]]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %I64d, actual %I64d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm512_mmask_i32gather_epi64(); + error |= checkm(1, dst512_i, dst_i, mask512, src_i, 8); + + do_mm512_mmask_i32gather_pd(); + error |= checkm(2, (__int64 *)dst512_f, (__int64 *)dst_f, mask512, + (__int64 *)src_f, 8); + + init_data(); + + do_mm512_i32gather_epi64(); + error |= check(3, dst512_i, src_i, 8); + + do_mm512_i32gather_pd(); + error |= check(4, (__int64 *)dst512_f, (__int64 *)src_f, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c +++ SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c @@ -0,0 +1,134 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_i32scatter_epi64() + * _mm512_mask_i32scatter_epi64() + * _mm512_i32scatter_pd() + * _mm512_mask_i32scatter_pd() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst512_f[NUM], gold_dst512_f[NUM], full_gold_dst512_f[NUM]; +__int64 dst512_i[NUM], gold_dst512_i[NUM], full_gold_dst512_i[NUM]; +int mask512[NUM / 8]; +int full_mask512[NUM / 8]; +int g_index[NUM]; + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst512_i[i] = -1; + dst512_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 8 == 0) { + mask512[i / 8] = (i * 31) & 0xff; + full_mask512[i / 8] = 0xff; + } + + if ((mask512[i / 8] >> (i % 8)) & 0x1) { + gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = i; + } else { + gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = -1; + } + + full_gold_dst512_i[g_index[i]] = full_gold_dst512_f[g_index[i]] = i; + } +} + +void do_mm512_mask_i32scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512i val = + _mm512_set_epi64(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_mask_i32scatter_epi64(dst512_i, mask512[i / 8], ind, val, SCALE); + } +} + +void do_mm512_mask_i32scatter_pd() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512d val = + _mm512_set_pd(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_mask_i32scatter_pd(dst512_f, mask512[i / 8], ind, val, SCALE); + } +} + +void do_mm512_i32scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const 
__m256i *)(g_index + i)); + __m512i val = + _mm512_set_epi64(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_i32scatter_epi64(dst512_i, ind, val, SCALE); + } +} + +void do_mm512_i32scatter_pd() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m512d val = + _mm512_set_pd(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_i32scatter_pd(dst512_f, ind, val, SCALE); + } +} + +int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %I64d, actual %I64d, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm512_mask_i32scatter_epi64(); + error |= check(1, dst512_i, gold_dst512_i, mask512, 8); + + do_mm512_mask_i32scatter_pd(); + error |= check(2, (__int64 *)dst512_f, (__int64 *)gold_dst512_f, mask512, 8); + + init_data(); + + do_mm512_i32scatter_epi64(); + error |= check(3, dst512_i, full_gold_dst512_i, full_mask512, 8); + + do_mm512_i32scatter_pd(); + error |= check(4, (__int64 *)dst512_f, (__int64 *)full_gold_dst512_f, + full_mask512, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c +++ SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c @@ -0,0 +1,148 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_i64gather_epi32() + * _mm512_mask_i64gather_epi32() + * _mm512_i64gather_ps() + * _mm512_mask_i64gather_ps() + */ + +#include "m512_test_util.h" +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst512_f[NUM]; +float dst_f[NUM]; +int dst512_i[NUM]; +int dst_i[NUM]; +float src_f[NUM]; +int src_i[NUM]; +int mask512[NUM / 8]; +__int64 g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? 
(x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM - 1); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst512_i[i] = -i; + dst512_f[i] = -i; + + if (i % 8 == 0) { + mask512[i / 8] = (i * 31) & 0xff; + } + } +} + +void do_mm512_mask_i64gather_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i)); + + __m256i gtr = + _mm512_mask_i64gather_epi32(old_dst, mask512[i / 8], ind, src_i, SCALE); + + _mm256_storeu_si256((__m256i *)(dst512_i + i), gtr); + } +} + +void do_mm512_mask_i64gather_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256 old_dst = _mm256_loadu_ps(dst_f + i); + __m256 gtr = + _mm512_mask_i64gather_ps(old_dst, mask512[i / 8], ind, src_f, SCALE); + _mm256_storeu_ps(dst512_f + i, gtr); + } +} + +void do_mm512_i64gather_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256i gtr = _mm512_i64gather_epi32(ind, src_i, SCALE); + _mm256_storeu_si256((__m256i *)(dst512_i + i), gtr); + } +} + +void do_mm512_i64gather_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256 gtr = _mm512_i64gather_ps(ind, src_f, SCALE); + _mm256_storeu_ps(dst512_f + i, gtr); + } +} + +int checkm(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + int v = kmask_bit ? 
src[g_index[i]] : pass_thru_vals[i]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int check(int id, int *res_dst, int *src, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + + int v = src[g_index[i]]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm512_mask_i64gather_epi32(); + error |= checkm(1, dst512_i, dst_i, mask512, src_i, 8); + + do_mm512_mask_i64gather_ps(); + error |= checkm(2, (int *)dst512_f, (int *)dst_f, mask512, (int *)src_f, 8); + + init_data(); + + do_mm512_i64gather_epi32(); + error |= check(3, dst512_i, src_i, 8); + + do_mm512_i64gather_ps(); + error |= check(4, (int *)dst512_f, (int *)src_f, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c +++ SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c @@ -0,0 +1,132 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_i64scatter_epi32() + * _mm512_mask_i64scatter_epi32() + * _mm512_i64scatter_ps() + * _mm512_mask_i64scatter_ps() + */ + +#include "m512_test_util.h" +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst512_f[NUM], gold_dst512_f[NUM], full_gold_dst512_f[NUM]; +int dst512_i[NUM], gold_dst512_i[NUM], full_gold_dst512_i[NUM]; +int mask512[NUM / 8]; +int full_mask512[NUM / 8]; +__int64 g_index[NUM]; + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst512_i[i] = -1; + dst512_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 8 == 0) { + mask512[i / 8] = (i * 31) & 0xff; + full_mask512[i / 8] = 0xff; + } + + if ((mask512[i / 8] >> (i % 8)) & 0x1) { + gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = i; + } else { + gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = -1; + } + full_gold_dst512_i[g_index[i]] = full_gold_dst512_f[g_index[i]] = i; + } +} + +void do_mm512_mask_i64scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256i val = + _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_mask_i64scatter_epi32(dst512_i, mask512[i / 8], ind, val, SCALE); + } +} + +void do_mm512_mask_i64scatter_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256 val = + _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_mask_i64scatter_ps(dst512_f, mask512[i / 8], ind, val, SCALE); + } +} + +void do_mm512_i64scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + 
__m256i val = + _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_i64scatter_epi32(dst512_i, ind, val, SCALE); + } +} + +void do_mm512_i64scatter_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i)); + __m256 val = + _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm512_i64scatter_ps(dst512_f, ind, val, SCALE); + } +} + +int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm512_mask_i64scatter_epi32(); + error |= check(1, dst512_i, gold_dst512_i, mask512, 8); + + do_mm512_mask_i64scatter_ps(); + error |= check(2, (int *)dst512_f, (int *)gold_dst512_f, mask512, 8); + + init_data(); + + do_mm512_i64scatter_epi32(); + error |= check(3, dst512_i, full_gold_dst512_i, full_mask512, 8); + + do_mm512_i64scatter_ps(); + error |= + check(4, (int *)dst512_f, (int *)full_gold_dst512_f, full_mask512, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c +++ SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c @@ -0,0 +1,479 @@ + +/* + * Test 128 and 256-bit load and store intrinsics, + * with masked and zero-masked forms, by comparing + * their output with the corresponding 512-bit intrinsic. + * + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_load_si512() + * _mm512_store_si512() + * _mm256_storeu_ps() + * _mm256_storeu_si256() + * _mm_storeu_ps() + * _mm_storeu_si128() + */ + +#include "m512_test_util.h" +#include + +V512 fsrc1; +V512 fsrc2; +V512 fsrc_non_negative; +V512 fsrc_arr[2]; + +V512 dsrc1; +V512 dsrc2; +V512 dsrc_non_negative; +V512 dsrc_arr[2]; + +V512 isrc1; +V512 isrc2; +V512 isrc_arr[2]; + +static void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + fsrc1.f32[i] = (float)(i + 1); + fsrc2.f32[i] = 4.0f * (float)(i + 1); + fsrc_non_negative.f32[i] = 2.0f * (float)(i); + isrc2.s32[i] = (3 * i) - 17; + } + + fsrc_arr[0] = fsrc2; + fsrc_arr[1] = fsrc1; + + for (i = 0; i < 64; i++) { + isrc1.s8[i] = 2 * i + 1; + if (i % 3) { + isrc1.s8[i] = -isrc1.s8[i]; + } + } + + isrc_arr[0] = isrc2; + isrc_arr[1] = isrc1; + + for (i = 0; i < 8; i++) { + dsrc1.f64[i] = (double)(-i - 1); + dsrc2.f64[i] = 3.0 * (double)(-i - 1); + dsrc_non_negative.f64[i] = 4.0f * (double)(i); + } + + dsrc_arr[0] = dsrc2; + dsrc_arr[1] = dsrc1; +} + +/* + * Use "soft update" between tests to make compiler think src was updated. + * Prevents PRE'ing a load of src, thus allowing ciscization. 
+ * Also prevents PRE'ing intrinsic operations, ensuring we + * execute the intended instructions. + */ +volatile int vol0 = 0; +#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0] + +void NOINLINE do_load_and_loadu_pd() { + V512 xmm_res, ymm_res, zmm_res; + __mmask8 k8 = 0x7e; + + /* Non-masked. */ + + soft_v512_update(dsrc1); + zmm_res.zmmd = _mm512_load_pd(&dsrc1.zmmd); + + soft_v512_update(dsrc_arr[0]); + zmm_res.zmmd = _mm512_loadu_pd(&dsrc_arr[0].f64[1]); + + /* Masked. */ + + zmm_res.zmmd = _mm512_setzero_pd(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(dsrc1); + zmm_res.zmmd = _mm512_mask_load_pd(zmm_res.zmmd, k8, &dsrc1.zmmd); + + zmm_res.zmmd = _mm512_setzero_pd(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(dsrc_arr[0]); + zmm_res.zmmd = _mm512_mask_loadu_pd(zmm_res.zmmd, k8, &dsrc_arr[0].f64[3]); + + /* Zero-masked. */ + + zmm_res.zmmd = _mm512_set1_pd(1.0); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(dsrc1); + zmm_res.zmmd = _mm512_maskz_load_pd(k8, &dsrc1.zmmd); + soft_v512_update(dsrc1); + + zmm_res.zmmd = _mm512_set1_pd(1.0); + ymm_res = zmm_res; + xmm_res = zmm_res; +} + +void NOINLINE do_store_and_storeu_pd() { + V512 xmm_res[2], ymm_res[2], zmm_res[2]; + __mmask8 k8 = 0xef; + + /* Non-masked. */ + + zmm_res[0].zmmd = _mm512_set1_pd(1.0); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(dsrc1); + _mm512_store_pd(&zmm_res[0].zmmd, dsrc1.zmmd); + + soft_v512_update(dsrc1); + _mm512_storeu_pd(&zmm_res[0].f64[1], dsrc1.zmmd); + + /* Masked. */ + + zmm_res[0].zmmd = _mm512_set1_pd(1.0); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(dsrc1); + _mm512_mask_store_pd(&zmm_res[0].zmmd, k8, dsrc1.zmmd); + + zmm_res[0].zmmd = _mm512_set1_pd(1.0); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(dsrc1); + _mm512_mask_storeu_pd(&zmm_res[0].f64[1], k8, dsrc1.zmmd); +} + +void NOINLINE do_load_and_loadu_ps() { + V512 xmm_res, ymm_res, zmm_res; + __mmask16 k16 = 0xff7e; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. */ + + soft_v512_update(fsrc1); + zmm_res.zmm = _mm512_load_ps(&fsrc1.zmm); + + soft_v512_update(fsrc_arr[0]); + zmm_res.zmm = _mm512_loadu_ps(&fsrc_arr[0].f32[3]); + + /* Masked. */ + + zmm_res.zmm = _mm512_setzero_ps(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(fsrc1); + zmm_res.zmm = _mm512_mask_load_ps(zmm_res.zmm, k16, &fsrc1.zmm); + + zmm_res.zmm = _mm512_setzero_ps(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(fsrc_arr[0]); + zmm_res.zmm = _mm512_mask_loadu_ps(zmm_res.zmm, k16, &fsrc_arr[0].f32[5]); + + /* Zero-masked. */ + + zmm_res.zmm = _mm512_set1_ps(1.0f); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(fsrc1); + zmm_res.zmm = _mm512_maskz_load_ps(k16, &fsrc1.zmm); + + zmm_res.zmm = _mm512_set1_ps(1.0f); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(fsrc_arr[0]); + zmm_res.zmm = _mm512_maskz_loadu_ps(k16, &fsrc_arr[0].f32[5]); +} + +void NOINLINE do_store_and_storeu_ps() { + V512 xmm_res[2], ymm_res[2], zmm_res[2]; + __mmask16 k16 = 0xffef; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. 
*/ + + zmm_res[0].zmm = _mm512_set1_ps(1.0f); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(fsrc1); + _mm512_store_ps(&zmm_res[0].zmm, fsrc1.zmm); + + soft_v512_update(fsrc1); + _mm512_storeu_ps(&zmm_res[0].f32[1], fsrc1.zmm); + soft_v512_update(fsrc1); + _mm256_storeu_ps(&ymm_res[0].f32[1], fsrc1.ymm[0]); + soft_v512_update(fsrc1); + _mm_storeu_ps(&xmm_res[0].f32[1], fsrc1.xmm[0]); + + check_equal_nsf(&ymm_res[0].f32[1], &zmm_res[0].f32[1], 8, "_mm256_storeu_ps", + __LINE__); + check_equal_nsf(&xmm_res[0].f32[1], &zmm_res[0].f32[1], 4, "_mm_storeu_ps", + __LINE__); + + /* Masked. */ + + zmm_res[0].zmm = _mm512_set1_ps(1.0f); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(fsrc1); + _mm512_mask_store_ps(&zmm_res[0].zmm, k8, fsrc1.zmm); + + zmm_res[0].zmm = _mm512_set1_ps(1.0f); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(fsrc1); + _mm512_mask_storeu_ps(&zmm_res[0].f32[1], k8, fsrc1.zmm); +} + +void NOINLINE do_load_and_loadu_epi32() { + V512 xmm_res, ymm_res, zmm_res; + __mmask16 k16 = 0xffef; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. */ + + soft_v512_update(isrc1); + zmm_res.zmmi = _mm512_load_epi32(&isrc1.zmmi); + + soft_v512_update(isrc1); + ymm_res.zmmi = _mm512_load_si512(&isrc1.zmmi); + check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_load_si512", __LINE__); + + soft_v512_update(isrc_arr[0]); + zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s32[1]); + + /* Masked. */ + + zmm_res.zmmi = _mm512_setzero_epi32(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc1); + zmm_res.zmmi = _mm512_mask_load_epi32(zmm_res.zmmi, k16, &isrc1.zmmi); + + zmm_res.zmmi = _mm512_setzero_epi32(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc_arr[0]); + zmm_res.zmmi = + _mm512_mask_loadu_epi32(zmm_res.zmmi, k16, &isrc_arr[0].s32[3]); + + /* Zero-masked. */ + + zmm_res.zmmi = _mm512_set1_epi32(-7); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc1); + zmm_res.zmmi = _mm512_maskz_load_epi32(k16, &isrc1.zmmi); + + zmm_res.zmmi = _mm512_set1_epi32(11); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc_arr[0]); + zmm_res.zmmi = _mm512_maskz_loadu_epi32(k16, &isrc_arr[0].s32[1]); +} + +void NOINLINE do_store_and_storeu_epi32() { + V512 xmm_res[2], ymm_res[2], zmm_res[2]; + __mmask16 k16 = 0xfeff; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. 
*/ + + zmm_res[0].zmmi = _mm512_set1_epi32(-101); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_store_epi32(&zmm_res[0].zmmi, isrc1.zmmi); + soft_v512_update(isrc1); + _mm256_store_si256((__m256i *)&ymm_res[0].s32[0], isrc1.ymmi[0]); + soft_v512_update(isrc1); + _mm_store_si128((__m128i *)&xmm_res[0].s32[0], isrc1.xmmi[0]); + + check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_store_epi32", __LINE__); + check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_store_epi32", __LINE__); + + soft_v512_update(isrc1); + ymm_res[0].zmmi = _mm512_setzero_si512(); + _mm512_store_si512(&ymm_res[0].zmmi, isrc1.zmmi); + check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_store_si512", __LINE__); + + soft_v512_update(isrc1); + _mm512_storeu_si512(&zmm_res[0].s32[1], isrc1.zmmi); + soft_v512_update(isrc1); + _mm256_storeu_si256((__m256i *)&ymm_res[0].s32[1], isrc1.ymmi[0]); + soft_v512_update(isrc1); + _mm_storeu_si128((__m128i *)&xmm_res[0].s32[1], isrc1.xmmi[0]); + + check_equal_nd(&ymm_res[0].s32[1], &zmm_res[0].s32[1], 8, + "_mm256_storeu_si256", __LINE__); + check_equal_nd(&xmm_res[0].s32[1], &zmm_res[0].s32[1], 4, "_mm_storeu_si128", + __LINE__); + + /* Masked. */ + + zmm_res[0].zmmi = _mm512_set1_epi32(999); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_mask_store_epi32(&zmm_res[0].zmmi, k16, isrc1.zmmi); + + zmm_res[0].zmmi = _mm512_set1_epi32(-3); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_mask_storeu_epi32(&zmm_res[0].s32[1], k16, isrc1.zmmi); +} + +void NOINLINE do_load_and_loadu_epi64() { + V512 xmm_res, ymm_res, zmm_res; + __mmask16 k8 = 0xef; + + /* Non-masked. */ + + soft_v512_update(isrc1); + zmm_res.zmmi = _mm512_load_epi64(&isrc1.zmmi); + + soft_v512_update(isrc_arr[0]); + zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s64[1]); + + /* Masked. */ + + zmm_res.zmmi = _mm512_setzero_epi32(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc1); + zmm_res.zmmi = _mm512_mask_load_epi64(zmm_res.zmmi, k8, &isrc1.zmmi); + + zmm_res.zmmi = _mm512_setzero_epi32(); + ymm_res = zmm_res; + xmm_res = zmm_res; + + soft_v512_update(isrc_arr[0]); + zmm_res.zmmi = _mm512_mask_loadu_epi64(zmm_res.zmmi, k8, &isrc_arr[0].s64[3]); + + /* Zero-masked. */ + + zmm_res.zmmi = _mm512_set1_epi64(-7); + ymm_res = zmm_res; + xmm_res = zmm_res; +} + +void NOINLINE do_store_and_storeu_epi64() { + V512 xmm_res[2], ymm_res[2], zmm_res[2]; + __mmask8 k8 = 0xfe; + + /* Non-masked. */ + + zmm_res[0].zmmi = _mm512_set1_epi32(-101); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_store_epi64(&zmm_res[0].zmmi, isrc1.zmmi); + soft_v512_update(isrc1); + _mm256_store_si256((__m256i *)&ymm_res[0].s64[0], isrc1.ymmi[0]); + soft_v512_update(isrc1); + _mm_store_si128((__m128i *)&xmm_res[0].s64[0], isrc1.xmmi[0]); + + check_equal_nq(&ymm_res, &zmm_res, 4, "_mm256_store_epi64", __LINE__); + check_equal_nq(&xmm_res, &zmm_res, 2, "_mm_store_si256", __LINE__); + + soft_v512_update(isrc1); + _mm512_storeu_si512(&zmm_res[0].s64[1], isrc1.zmmi); + + /* Masked. 
*/ + + zmm_res[0].zmmi = _mm512_set1_epi32(999); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_mask_store_epi64(&zmm_res[0].zmmi, k8, isrc1.zmmi); + + zmm_res[0].zmmi = _mm512_set1_epi32(-3); + zmm_res[1] = zmm_res[0]; + ymm_res[0] = zmm_res[0]; + ymm_res[1] = zmm_res[0]; + xmm_res[0] = zmm_res[0]; + xmm_res[1] = zmm_res[0]; + + soft_v512_update(isrc1); + _mm512_mask_storeu_epi64(&zmm_res[0].s64[1], k8, isrc1.zmmi); +} + +int main() { + init(); + + do_load_and_loadu_pd(); + do_load_and_loadu_ps(); + do_load_and_loadu_epi32(); + do_store_and_storeu_epi32(); + do_load_and_loadu_epi64(); + + do_store_and_storeu_pd(); + do_store_and_storeu_ps(); + + do_store_and_storeu_epi64(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/mask_mov.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/mask_mov.c +++ SingleSource/UnitTests/Vector/AVX512F/mask_mov.c @@ -0,0 +1,135 @@ +/* + * Test mask_mov and maskz_mov intructions + */ + +#include "m512_test_util.h" +#include + +__m512i i1; +__m512i i2; +__m512i i3; +__m512i i4; + +__m512 f1; +__m512 f2; +__m512 f3; +__m512 f4; + +__m512d d1; +__m512d d2; +__m512d d3; +__m512d d4; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE set_nonzero(void *vp, int c) { + int i; + V512 *v = (V512 *)vp; + + for (i = 0; i < 16; i++) { + v->u32[i] = 10 * i * i - 3 * i + c + vol; + if (v->u32[i] == 0) { + v->u32[i] = 1234; + } + } +} + +void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 16; i++) { + int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i]; + if (got->s32[i] != ans) { + printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n", + banner ? banner : "", got->s32[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 8; i++) { + __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i]; + if (got->s64[i] != ans) { + printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64 + " at element [%d]\n", + banner ? 
banner : "", got->s64[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE do_mask_mov_32() { + int k = 0xf2f3; + __m512i zeroi = _mm512_setzero_epi32(); + + set_nonzero(&i1, -97); + set_nonzero(&i2, 22); + set_nonzero(&i3, 22); + + i3 = _mm512_mask_mov_epi32(i1, k, i2); + check_equal32(&i3, &i2, &i1, k, "_mm512_mask_mov_epi32"); + + i4 = _mm512_maskz_mov_epi32(k, i1); + check_equal32(&i4, &i1, &zeroi, k, "_mm512_maskz_mov_epi32"); + + set_nonzero(&f1, -96); + set_nonzero(&f2, 21); + set_nonzero(&f3, 1400); + + f3 = _mm512_mask_mov_ps(f1, k, f2); + check_equal32(&f3, &f2, &f1, k, "_mm512_mask_mov_ps"); + + f4 = _mm512_maskz_mov_ps(k, f1); + check_equal32(&f4, &f1, &zeroi, k, "_mm512_maskz_mov_ps"); +} + +void NOINLINE do_mask_mov_64() { + __mmask8 k = 0x59; + __m512i zeroi = _mm512_setzero_epi32(); + + set_nonzero(&i1, -97); + set_nonzero(&i2, 22); + set_nonzero(&i3, 22); + + i3 = _mm512_mask_mov_epi64(i1, k, i2); + check_equal64(&i3, &i2, &i1, k, "_mm512_mask_mov_epi64"); + + i4 = _mm512_maskz_mov_epi64(k, i1); + check_equal64(&i4, &i1, &zeroi, k, "_mm512_maskz_mov_epi64"); + + set_nonzero(&d1, -96); + set_nonzero(&d2, 21); + set_nonzero(&d3, 1400); + + d3 = _mm512_mask_mov_pd(d1, k, d2); + check_equal64(&d3, &d2, &d1, k, "_mm512_mask_mov_pd"); + + d4 = _mm512_maskz_mov_pd(k, d1); + check_equal64(&d4, &d1, &zeroi, k, "_mm512_maskz_mov_pd"); +} + +int main() { + do_mask_mov_32(); + do_mask_mov_64(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/movedup.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/movedup.c +++ SingleSource/UnitTests/Vector/AVX512F/movedup.c @@ -0,0 +1,213 @@ +/* + * Test movedup and moveldup instructions. + * Here we check for _mm512_[mask|maskz]move[l|h]dup intrinsics + */ + +#include "m512_test_util.h" +#include + +int verbose = 0; + +__m512 f1; +__m512 f2; +__m512 f3; +__m512 f4; + +__m512d d1; +__m512d d2; +__m512d d3; +__m512d d4; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE init() { + int i; + V512 *pf1 = (V512 *)&f1; + V512 *pf2 = (V512 *)&f2; + V512 *pd1 = (V512 *)&d1; + V512 *pd2 = (V512 *)&d2; + + for (i = 0; i < 16; i++) { + pf1->f32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol; + pf2->f32[i] = -(100 + ((i & 3) == 3 ? 1 : -1) * i + vol); + } + + for (i = 0; i < 8; i++) { + pd1->f64[i] = pf1->f32[i]; + pd2->f64[i] = -pf2->f32[i]; + } +} + +void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 16; i++) { + int ans = (mask & (1 << i)) ? expected->u32[i] : orig->u32[i]; + if (got->u32[i] != ans) { + printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n", + banner ? 
banner : "", got->u32[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 8; i++) { + __int64 ans = (mask & (1 << i)) ? expected->u64[i] : orig->u64[i]; + if (got->u64[i] != ans) { + printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64 + " at element [%d]\n", + banner ? banner : "", got->u64[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE emulate_movedup_pd(void *presult, const void *p1, int mask, + const void *p2, int zero_masking) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = zero_masking ? 0 : v1->u64[i]; + } else { + int src_index = i & 0xfe; // even index + result->u64[i] = v2->u64[src_index]; + } + } +} + +void NOINLINE emulate_moveldup_ps(void *presult, const void *p1, int mask, + const void *p2, int zero_masking) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = zero_masking ? 0 : v1->u32[i]; + } else { + int src_index = i & 0xfe; // even index + result->u32[i] = v2->u32[src_index]; + } + } +} + +void NOINLINE emulate_movehdup_ps(void *presult, const void *p1, int mask, + const void *p2, int zero_masking) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = zero_masking ? 0 : v1->u32[i]; + } else { + int src_index = (i & 0xfe) + 1; // odd index + result->u32[i] = v2->u32[src_index]; + } + } +} + +void NOINLINE do_movedup_pd() { + if (verbose) { + printf("BEGIN do_movedup_pd\n"); + } + + d3 = _mm512_movedup_pd(d2); + emulate_movedup_pd(&d4, (void *)0, 0xff, &d2, 0); + check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_movedup_pd"); + + d3 = _mm512_maskz_movedup_pd(0xc5, d2); + emulate_movedup_pd(&d4, (void *)0, 0xc5, &d2, 1); + check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_maskz_movedup_pd"); + + d3 = _mm512_mask_movedup_pd(d1, 0xda, d2); + emulate_movedup_pd(&d4, &d1, 0xda, &d2, 0); + check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_mask_movedup_pd"); + + if (verbose) { + printf("DONE\n"); + } +} + +void NOINLINE do_moveldup_ps() { + if (verbose) { + printf("BEGIN do_moveldup_ps\n"); + } + + f3 = _mm512_moveldup_ps(f2); + emulate_moveldup_ps(&f4, (void *)0, 0xffff, &f2, 0); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_moveldup_ps"); + + f3 = _mm512_maskz_moveldup_ps(0x79fa, f2); + emulate_moveldup_ps(&f4, (void *)0, 0x79fa, &f2, 1); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_maskz_moveldup_ps"); + + f3 = _mm512_mask_moveldup_ps(f1, 0x53da, f2); + emulate_moveldup_ps(&f4, &f1, 0x53da, &f2, 0); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_mask_moveldup_ps"); + + if (verbose) { + printf("DONE\n"); + } +} + +void NOINLINE do_movehdup_ps() { + if (verbose) { + printf("BEGIN do_movehdup_ps\n"); + } + + f3 = _mm512_movehdup_ps(f2); + emulate_movehdup_ps(&f4, (void *)0, 0xffff, &f2, 0); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_movehdup_ps"); + + f3 = _mm512_maskz_movehdup_ps(0x79fa, f2); + emulate_movehdup_ps(&f4, (void *)0, 0x79fa, &f2, 1); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_maskz_movehdup_ps"); + + 
f3 = _mm512_mask_movehdup_ps(f1, 0x79fa, f2); + emulate_movehdup_ps(&f4, &f1, 0x79fa, &f2, 0); + check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_mask_movehdup_ps"); + + if (verbose) { + printf("DONE\n"); + } +} + +int main(int argc, char *argv[]) { + init(); + + do_movedup_pd(); + + do_moveldup_ps(); + do_movehdup_ps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/store.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/store.c +++ SingleSource/UnitTests/Vector/AVX512F/store.c @@ -0,0 +1,144 @@ +/* + * Test store instructions. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_store_ps() + * _mm512_mask_store_ps() + * _mm512_store_epi32() + * _mm512_mask_store_epi32() + * _mm512_store_epi64() + * _mm512_mask_store_epi64() + * _mm512_store_pd() + * _mm512_mask_store_pd() + */ + +#include "m512_test_util.h" +#include + +__m512 v1; +__m512i i1; +__m512d d1; +V512 tval; +int ALIGNTO(64) dest_memory[16]; +unsigned int et_memory[16]; +void *pdst = (void *)&dest_memory; + +unsigned int initial = 0; +void NOINLINE set_nonzero(void *vp, int c) { + int i; + V512 *v = (V512 *)vp; + + for (i = 0; i < 16; i++) { + v->u32[i] = 10 * i * i - 3 * i + c; + if (v->u32[i] == 0) { + v->u32[i] = 1234; + } + et_memory[i] = initial; + dest_memory[i] = initial; + } +} + +void NOINLINE set_generic32_result(void *vp1, int mask, int subset) { + int i, max; + V512 *v1 = (V512 *)vp1; + + max = (subset == 1 ? 1 : (subset == 2 ? 4 : 16)); + for (i = 0; i < max; i++) { + if ((mask & 0x1) != 0) { + et_memory[i] = v1->u32[i]; + } + mask >>= 1; + } +} + +void NOINLINE set_generic64_result(void *vp1, int mask, int subset) { + int i, max; + V512 *v1 = (V512 *)vp1; + + max = (subset == 1 ? 1 : (subset == 2 ? 
4 : 8)); + for (i = 0; i < max; i++) { + if ((mask & 0x1) != 0) { + ((U64 *)&et_memory)[i] = v1->u64[i]; + } + mask >>= 1; + } +} + +void NOINLINE do_store_ps() { + /* full vector */ + set_nonzero(&v1, 11); + set_generic32_result(&v1, 0xffff, 0); + _mm512_store_ps(pdst, v1); + check_equal_nd(pdst, &et_memory, 16, "_mm512_store_ps - full vector", + __LINE__); + + /* vector with write mask */ + set_nonzero(&v1, 5); + set_generic32_result(&v1, 0xf00f, 0); + _mm512_mask_store_ps(pdst, 0xf00f, v1); + check_equal_nd(pdst, &et_memory, 16, "_mm512_mask_store_ps - full vector", + __LINE__); +} + +void NOINLINE do_store_epi32() { + /* full vector */ + set_nonzero(&i1, 11); + set_generic32_result(&i1, 0xffff, 0); + _mm512_store_epi32(pdst, i1); + check_equal_nd(pdst, &et_memory, 16, "_mm512_store_epi32 - full vector", + __LINE__); + + /* vector with write mask */ + set_nonzero(&i1, 5); + set_generic32_result(&i1, 0xf00f, 0); + _mm512_mask_store_epi32(pdst, 0xf00f, i1); + check_equal_nd(pdst, &et_memory, 16, "_mm512_mask_store_epi32 - full vector", + __LINE__); +} + +void NOINLINE do_store_epi64() { + /* full vector */ + set_nonzero(&i1, 11); + set_generic64_result(&i1, 0xff, 0); + _mm512_store_epi64(pdst, i1); + check_equal_nq(pdst, &et_memory, 8, "_mm512_store_epi64 - full vector", + __LINE__); + + /* vector with write mask */ + set_nonzero(&i1, 5); + set_generic64_result(&i1, 0x60, 0); + _mm512_mask_store_epi64(pdst, 0x60, i1); + check_equal_nq(pdst, &et_memory, 8, "_mm512_mask_store_epi64 - full vector", + __LINE__); +} + +void NOINLINE do_store_pd() { + /* full vector */ + set_nonzero(&d1, 11); + set_generic64_result(&d1, 0xff, 0); + _mm512_store_pd(pdst, d1); + check_equal_nq(pdst, &et_memory, 8, "_mm512_store_pd - full vector", + __LINE__); + + /* vector with write mask */ + set_nonzero(&d1, 5); + set_generic64_result(&d1, 0xf1, 0); + _mm512_mask_store_pd(pdst, 0xf1, d1); + check_equal_nq(pdst, &et_memory, 8, "_mm512_mask_store_pd - full vector", + __LINE__); +} + +int main() { + do_store_ps(); + do_store_epi32(); + do_store_epi64(); + do_store_pd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/store.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/store.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/store.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512VL-") Index: SingleSource/UnitTests/Vector/AVX512VL/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/Makefile +++ SingleSource/UnitTests/Vector/AVX512VL/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512VL/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512vl -I${SourceDir}/.. 
+LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512vl +LCCFLAGS += -march=native -mavx512vl Index: SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c +++ SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c @@ -0,0 +1,135 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst128_f[NUM]; +float dst256_f[NUM]; +float dst_f[NUM]; +int dst128_i[NUM]; +int dst256_i[NUM]; +int dst_i[NUM]; +float src_f[NUM]; +int src_i[NUM]; +int mask128[NUM / 4]; +int mask256[NUM / 8]; +int g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst128_i[i] = dst256_i[i] = -i; + dst128_f[i] = dst256_f[i] = -i; + + if (i % 4 == 0) { + mask128[i / 4] = (i * 77) & 0xf; + if (i % 8 == 0) { + mask256[i / 8] = (i * 31) & 0xff; + } + } + } +} + +void do_mm_mmask_i32gather_epi32() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i)); + __m128i gtr = + _mm_mmask_i32gather_epi32(old_dst, mask128[i / 4], ind, src_i, SCALE); + _mm_storeu_si128((__m128i *)(dst128_i + i), gtr); + } +} + +void do_mm_mmask_i32gather_ps() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128 old_dst = _mm_loadu_ps(dst_f + i); + __m128 gtr = + _mm_mmask_i32gather_ps(old_dst, mask128[i / 4], ind, src_f, SCALE); + _mm_storeu_ps(dst128_f + i, gtr); + } +} + +void do_mm256_mmask_i32gather_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i)); + __m256i gtr = _mm256_mmask_i32gather_epi32(old_dst, mask256[i / 8], ind, + src_i, SCALE); + _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr); + } +} + +void do_mm256_mmask_i32gather_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256 old_dst = _mm256_loadu_ps(dst_f + i); + __m256 gtr = + _mm256_mmask_i32gather_ps(old_dst, mask256[i / 8], ind, src_f, SCALE); + _mm256_storeu_ps(dst256_f + i, gtr); + } +} + +int check(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + int v = kmask_bit ? 
src[g_index[i]] : pass_thru_vals[i]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mmask_i32gather_epi32(); + error |= check(1, dst128_i, dst_i, mask128, src_i, 4); + + do_mm_mmask_i32gather_ps(); + error |= check(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 4); + + do_mm256_mmask_i32gather_epi32(); + error |= check(3, dst256_i, dst_i, mask256, src_i, 8); + + do_mm256_mmask_i32gather_ps(); + error |= check(4, (int *)dst256_f, (int *)dst_f, mask256, (int *)src_f, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c +++ SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c @@ -0,0 +1,143 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi64() + * _mm256_mmask_i32gather_epi64() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst128_f[NUM]; +double dst256_f[NUM]; +double dst_f[NUM]; +__int64 dst128_i[NUM]; +__int64 dst256_i[NUM]; +__int64 dst_i[NUM]; +double src_f[NUM]; +__int64 src_i[NUM]; +__int64 mask128[NUM / 2]; +__int64 mask256[NUM / 4]; +int g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst128_i[i] = dst256_i[i] = -i; + dst128_f[i] = dst256_f[i] = -i; + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + } +} + +void do_mm_mmask_i32gather_epi64() { + int i; + for (i = 0; i < NUM; i += 2) { + + // Only the low 2 int32 elements should be used. + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i)); + __m128i gtr = + _mm_mmask_i32gather_epi64(old_dst, mask128[i / 2], ind, src_i, SCALE); + _mm_storeu_si128((__m128i *)(dst128_i + i), gtr); + } +} + +void do_mm_mmask_i32gather_pd() { + int i; + for (i = 0; i < NUM; i += 2) { + + // Only the low 2 int32 elements should be used. 
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + __m128d old_dst = _mm_loadu_pd(dst_f + i); + __m128d gtr = + _mm_mmask_i32gather_pd(old_dst, mask128[i / 2], ind, src_f, SCALE); + _mm_storeu_pd(dst128_f + i, gtr); + } +} + +void do_mm256_mmask_i32gather_epi64() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i)); + __m256i gtr = _mm256_mmask_i32gather_epi64(old_dst, mask256[i / 4], ind, + src_i, SCALE); + _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr); + } +} + +void do_mm256_mmask_i32gather_pd() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m256d old_dst = _mm256_loadu_pd(dst_f + i); + __m256d gtr = + _mm256_mmask_i32gather_pd(old_dst, mask256[i / 4], ind, src_f, SCALE); + _mm256_storeu_pd(dst256_f + i, gtr); + } +} + +int check(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask, + __int64 *src, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + __int64 kmask = mask[i / elems_in_vector]; + __int64 kmask_bit = kmask & (1 << (i % elems_in_vector)); + + __int64 v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i]; + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %lld, actual %lld\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mmask_i32gather_epi64(); + error |= check(1, dst128_i, dst_i, mask128, src_i, 2); + + do_mm_mmask_i32gather_pd(); + error |= check(2, (__int64 *)dst128_f, (__int64 *)dst_f, mask128, + (__int64 *)src_f, 2); + + do_mm256_mmask_i32gather_epi64(); + error |= check(3, dst256_i, dst_i, mask256, src_i, 4); + + do_mm256_mmask_i32gather_pd(); + error |= check(4, (__int64 *)dst256_f, (__int64 *)dst_f, mask256, + (__int64 *)src_f, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c +++ SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c @@ -0,0 +1,136 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst128_f[NUM], gold_dst128_f[NUM]; +float dst256_f[NUM], gold_dst256_f[NUM]; +int dst128_i[NUM], gold_dst128_i[NUM]; +int dst256_i[NUM], gold_dst256_i[NUM]; +int mask128[NUM / 4]; +int mask256[NUM / 8]; +int g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? 
(x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst128_i[i] = dst256_i[i] = -1; + dst128_f[i] = dst256_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 4 == 0) { + mask128[i / 4] = (i * 77) & 0xf; + if (i % 8 == 0) { + mask256[i / 8] = (i * 31) & 0xff; + } + } + + if ((mask128[i / 4] >> (i % 4)) & 0x1) { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i; + } else { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1; + } + + if ((mask256[i / 8] >> (i % 8)) & 0x1) { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i; + } else { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1; + } + } +} + +void do_mm_mask_i32scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128i val = _mm_set_epi32(i + 3, i + 2, i + 1, i); + _mm_mask_i32scatter_epi32(dst128_i, mask128[i / 4], ind, val, SCALE); + } +} + +void do_mm_mask_i32scatter_ps() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128 val = _mm_set_ps(i + 3, i + 2, i + 1, i); + _mm_mask_i32scatter_ps(dst128_f, mask128[i / 4], ind, val, SCALE); + } +} + +void do_mm256_mask_i32scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256i val = + _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm256_mask_i32scatter_epi32(dst256_i, mask256[i / 8], ind, val, SCALE); + } +} + +void do_mm256_mask_i32scatter_ps() { + int i; + for (i = 0; i < NUM; i += 8) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256 val = + _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i); + _mm256_mask_i32scatter_ps(dst256_f, mask256[i / 8], ind, val, SCALE); + } +} + +int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mask_i32scatter_epi32(); + error |= check(1, dst128_i, gold_dst128_i, mask128, 4); + + do_mm_mask_i32scatter_ps(); + error |= check(2, (int *)dst128_f, (int *)gold_dst128_f, mask128, 4); + + do_mm256_mask_i32scatter_epi32(); + error |= check(3, dst256_i, gold_dst256_i, mask256, 8); + + do_mm256_mask_i32scatter_ps(); + error |= check(4, (int *)dst256_f, (int *)gold_dst256_f, mask256, 8); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c +++ SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c @@ -0,0 +1,141 @@ +/* + * This test was created to check the 
correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst128_f[NUM], gold_dst128_f[NUM]; +double dst256_f[NUM], gold_dst256_f[NUM]; +__int64 dst128_i[NUM], gold_dst128_i[NUM]; +__int64 dst256_i[NUM], gold_dst256_i[NUM]; +int mask128[NUM / 2]; +int mask256[NUM / 4]; +int g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst128_i[i] = dst256_i[i] = -1; + dst128_f[i] = dst256_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + + if ((mask128[i / 2] >> (i % 2)) & 0x1) { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i; + } else { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1; + } + + if ((mask256[i / 4] >> (i % 4)) & 0x1) { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i; + } else { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1; + } + } +} + +void do_mm_mask_i32scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 2) { + + // Only 2 low int32 elements are going to be used. + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + __m128i val = _mm_set_epi64x(i + 1, i); + _mm_mask_i32scatter_epi64(dst128_i, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm_mask_i32scatter_pd() { + int i; + for (i = 0; i < NUM; i += 2) { + + // Only 2 low int32 elements are going to be used. + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + __m128d val = _mm_set_pd(i + 1, i); + _mm_mask_i32scatter_pd(dst128_f, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm256_mask_i32scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m256i val = _mm256_set_epi64x(i + 3, i + 2, i + 1, i); + _mm256_mask_i32scatter_epi64(dst256_i, mask256[i / 4], ind, val, SCALE); + } +} + +void do_mm256_mask_i32scatter_pd() { + int i; + for (i = 0; i < NUM; i += 4) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m256d val = _mm256_set_pd(i + 3, i + 2, i + 1, i); + _mm256_mask_i32scatter_pd(dst256_f, mask256[i / 4], ind, val, SCALE); + } +} + +int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %lld, actual %lld, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mask_i32scatter_epi64(); + error |= check(1, dst128_i, gold_dst128_i, mask128, 2); + + do_mm_mask_i32scatter_pd(); + error |= check(2, (__int64 *)dst128_f, (__int64 *)gold_dst128_f, mask128, 2); + + do_mm256_mask_i32scatter_epi64(); + error |= check(3, dst256_i, gold_dst256_i, mask256, 4); + + do_mm256_mask_i32scatter_pd(); + error |= check(4, (__int64 *)dst256_f, (__int64 *)gold_dst256_f, mask256, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output 
=================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c +++ SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c @@ -0,0 +1,137 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst128_f[NUM]; +float dst256_f[NUM]; +float dst_f[NUM]; +int dst128_i[NUM]; +int dst256_i[NUM]; +int dst_i[NUM]; +float src_f[NUM]; +int src_i[NUM]; +int mask128[NUM / 2]; +int mask256[NUM / 4]; +__int64 g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst128_i[i] = dst256_i[i] = -i; + dst128_f[i] = dst256_f[i] = -i; + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + } +} + +void do_mm_mmask_i64gather_epi32() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i)); + __m128i gtr = + _mm_mmask_i64gather_epi32(old_dst, mask128[i / 2], ind, src_i, SCALE); + _mm_storeu_si128((__m128i *)(dst128_i + i), gtr); + } +} + +void do_mm_mmask_i64gather_ps() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128 old_dst = _mm_loadu_ps(dst_f + i); + __m128 gtr = + _mm_mmask_i64gather_ps(old_dst, mask128[i / 2], ind, src_f, SCALE); + _mm_storeu_ps(dst128_f + i, gtr); + } +} + +void do_mm256_mmask_i64gather_epi32() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m128i old_dst = _mm_loadu_si128((const __m128i *)(dst_i + i)); + __m128i gtr = _mm256_mmask_i64gather_epi32(old_dst, mask256[i / 4], ind, + src_i, SCALE); + _mm_storeu_si128((__m128i *)(dst256_i + i), gtr); + } +} + +void do_mm256_mmask_i64gather_ps() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m128 old_dst = _mm_loadu_ps(dst_f + i); + __m128 gtr = + _mm256_mmask_i64gather_ps(old_dst, mask256[i / 4], ind, src_f, SCALE); + _mm_storeu_ps(dst256_f + i, gtr); + } +} + +int check(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + int v = kmask_bit ? 
src[g_index[i]] : pass_thru_vals[i]; + // printf("v= %d, g_index[i] = %d, src[g_index[i]]= %d\n ", v, g_index[i], + // src[g_index[i]]); + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mmask_i64gather_epi32(); + error |= check(1, dst128_i, dst_i, mask128, src_i, 2); + + do_mm_mmask_i64gather_ps(); + error |= check(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 2); + + do_mm256_mmask_i64gather_epi32(); + error |= check(3, dst256_i, dst_i, mask256, src_i, 4); + + do_mm256_mmask_i64gather_ps(); + error |= check(4, (int *)dst256_f, (int *)dst_f, mask256, (int *)src_f, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c +++ SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c @@ -0,0 +1,139 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i64gather_epi64() + * _mm256_mmask_i64gather_epi64() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst128_f[NUM]; +double dst256_f[NUM]; +double dst_f[NUM]; +__int64 dst128_i[NUM]; +__int64 dst256_i[NUM]; +__int64 dst_i[NUM]; +double src_f[NUM]; +__int64 src_i[NUM]; +__int64 mask128[NUM / 2]; +__int64 mask256[NUM / 4]; +__int64 g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? 
(x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + g_index[i] = MIN(i * 17 & 0xce, NUM); + src_f[g_index[i]] = src_i[g_index[i]] = i; + + dst_i[i] = dst_f[i] = -i; + dst128_i[i] = dst256_i[i] = -i; + dst128_f[i] = dst256_f[i] = -i; + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + } +} + +void do_mm_mmask_i64gather_epi64() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i)); + __m128i gtr = + _mm_mmask_i64gather_epi64(old_dst, mask128[i / 2], ind, src_i, SCALE); + _mm_storeu_si128((__m128i *)(dst128_i + i), gtr); + } +} + +void do_mm_mmask_i64gather_pd() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128d old_dst = _mm_loadu_pd(dst_f + i); + __m128d gtr = + _mm_mmask_i64gather_pd(old_dst, mask128[i / 2], ind, src_f, SCALE); + _mm_storeu_pd(dst128_f + i, gtr); + } +} + +void do_mm256_mmask_i64gather_epi64() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i)); + __m256i gtr = _mm256_mmask_i64gather_epi64(old_dst, mask256[i / 4], ind, + src_i, SCALE); + _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr); + } +} + +void do_mm256_mmask_i64gather_pd() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256d old_dst = _mm256_loadu_pd(dst_f + i); + __m256d gtr = + _mm256_mmask_i64gather_pd(old_dst, mask256[i / 4], ind, src_f, SCALE); + _mm256_storeu_pd(dst256_f + i, gtr); + } +} + +int check(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask, + __int64 *src, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + __int64 kmask = mask[i / elems_in_vector]; + __int64 kmask_bit = kmask & (1 << (i % elems_in_vector)); + + __int64 v = kmask_bit ? 
src[g_index[i]] : pass_thru_vals[i]; + // printf("v= %d, g_index[i] = %d, src[g_index[i]]= %d\n ", v, g_index[i], + // src[g_index[i]]); + + if (v != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %lld, actual %lld\n", v, res_dst[i]); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mmask_i64gather_epi64(); + error |= check(1, dst128_i, dst_i, mask128, src_i, 2); + + do_mm_mmask_i64gather_pd(); + error |= check(2, (__int64 *)dst128_f, (__int64 *)dst_f, mask128, + (__int64 *)src_f, 2); + + do_mm256_mmask_i64gather_epi64(); + error |= check(3, dst256_i, dst_i, mask256, src_i, 4); + + do_mm256_mmask_i64gather_pd(); + error |= check(4, (__int64 *)dst256_f, (__int64 *)dst_f, mask256, + (__int64 *)src_f, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c +++ SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c @@ -0,0 +1,139 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 4 + +float dst128_f[NUM], gold_dst128_f[NUM]; +float dst256_f[NUM], gold_dst256_f[NUM]; +int dst128_i[NUM], gold_dst128_i[NUM]; +int dst256_i[NUM], gold_dst256_i[NUM]; +int mask128[NUM / 2]; +int mask256[NUM / 4]; +__int64 g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst128_i[i] = dst256_i[i] = -1; + dst128_f[i] = dst256_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + + if ((mask128[i / 2] >> (i % 2)) & 0x1) { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i; + } else { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1; + } + + if ((mask256[i / 4] >> (i % 4)) & 0x1) { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i; + } else { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1; + } + } +} + +void do_mm_mask_i64scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + // Only the low 2 int32 values are going to be used. + __m128i val = _mm_set_epi32(0, 0, i + 1, i); + + _mm_mask_i64scatter_epi32(dst128_i, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm_mask_i64scatter_ps() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + + // Only the low 2 int32 values are going to be used. 
+ __m128 val = _mm_set_ps(0, 0, i + 1, i); + + _mm_mask_i64scatter_ps(dst128_f, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm256_mask_i64scatter_epi32() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m128i val = _mm_set_epi32(i + 3, i + 2, i + 1, i); + _mm256_mask_i64scatter_epi32(dst256_i, mask256[i / 4], ind, val, SCALE); + } +} + +void do_mm256_mask_i64scatter_ps() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m128 val = _mm_set_ps(i + 3, i + 2, i + 1, i); + _mm256_mask_i64scatter_ps(dst256_f, mask256[i / 4], ind, val, SCALE); + } +} + +int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mask_i64scatter_epi32(); + error |= check(1, dst128_i, gold_dst128_i, mask128, 2); + + do_mm_mask_i64scatter_ps(); + error |= check(2, (int *)dst128_f, (int *)gold_dst128_f, mask128, 2); + + do_mm256_mask_i64scatter_epi32(); + error |= check(3, dst256_i, gold_dst256_i, mask256, 4); + + do_mm256_mask_i64scatter_ps(); + error |= check(4, (int *)dst256_f, (int *)gold_dst256_f, mask256, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c +++ SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c @@ -0,0 +1,135 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_mmask_i32gather_epi32() + * _mm256_mmask_i32gather_epi32() + */ + +#include +#include + +#define NUM (256 * 256) +#define SCALE 8 + +double dst128_f[NUM], gold_dst128_f[NUM]; +double dst256_f[NUM], gold_dst256_f[NUM]; +__int64 dst128_i[NUM], gold_dst128_i[NUM]; +__int64 dst256_i[NUM], gold_dst256_i[NUM]; +int mask128[NUM / 2]; +int mask256[NUM / 4]; +__int64 g_index[NUM]; + +#define MIN(x, y) ((x) <= (y) ? 
(x) : (y)) + +void init_data() { + int i; + for (i = 0; i < NUM; i++) { + dst128_i[i] = dst256_i[i] = -1; + dst128_f[i] = dst256_f[i] = -1; + + g_index[i] = i * 2; + if (g_index[i] >= NUM) { + g_index[i] = NUM - 1 - (i - NUM / 2) * 2; + } + + if (i % 2 == 0) { + mask128[i / 2] = (i * 77) & 0xf; + if (i % 4 == 0) { + mask256[i / 4] = (i * 31) & 0xff; + } + } + + if ((mask128[i / 2] >> (i % 2)) & 0x1) { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i; + } else { + gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1; + } + + if ((mask256[i / 4] >> (i % 4)) & 0x1) { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i; + } else { + gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1; + } + } +} + +void do_mm_mask_i64scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128i val = _mm_set_epi64x(i + 1, i); + _mm_mask_i64scatter_epi64(dst128_i, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm_mask_i64scatter_pd() { + int i; + for (i = 0; i < NUM; i += 2) { + __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i)); + __m128d val = _mm_set_pd(i + 1, i); + _mm_mask_i64scatter_pd(dst128_f, mask128[i / 2], ind, val, SCALE); + } +} + +void do_mm256_mask_i64scatter_epi64() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256i val = _mm256_set_epi64x(i + 3, i + 2, i + 1, i); + _mm256_mask_i64scatter_epi64(dst256_i, mask256[i / 4], ind, val, SCALE); + } +} + +void do_mm256_mask_i64scatter_pd() { + int i; + for (i = 0; i < NUM; i += 4) { + __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i)); + __m256d val = _mm256_set_pd(i + 3, i + 2, i + 1, i); + _mm256_mask_i64scatter_pd(dst256_f, mask256[i / 4], ind, val, SCALE); + } +} + +int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask, + int elems_in_vector) { + int i; + for (i = 0; i < NUM; i++) { + int kmask = mask[i / elems_in_vector]; + int kmask_bit = kmask & (1 << (i % elems_in_vector)); + + if (gold_dst[i] != res_dst[i]) { + printf("The testcase #%d FAILed at %d iteration\n", id, i); + + printf("Expected value %lld, actual %lld, kmask=%d\n", gold_dst[i], + res_dst[i], kmask_bit); + + return -1; + } + } + return 0; +} + +int main() { + int error = 0; + + init_data(); + + do_mm_mask_i64scatter_epi64(); + error |= check(1, dst128_i, gold_dst128_i, mask128, 2); + + do_mm_mask_i64scatter_pd(); + error |= check(2, (__int64 *)dst128_f, (__int64 *)gold_dst128_f, mask128, 2); + + do_mm256_mask_i64scatter_epi64(); + error |= check(3, dst256_i, gold_dst256_i, mask256, 4); + + do_mm256_mask_i64scatter_pd(); + error |= check(4, (__int64 *)dst256_f, (__int64 *)gold_dst256_f, mask256, 4); + + if (error != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output +++ SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/CMakeLists.txt +++ SingleSource/UnitTests/Vector/CMakeLists.txt @@ -17,8 +17,11 @@ if(ARCH STREQUAL "x86") if(X86CPU_ARCH STREQUAL "skylake-avx512") add_subdirectory(AVX512F) + add_subdirectory(AVX512BW) 
     add_subdirectory(AVX512BWVL)
     add_subdirectory(AVX512DQ)
+    add_subdirectory(AVX512DQVL)
+    add_subdirectory(AVX512VL)
   endif()
   if(X86CPU_ARCH STREQUAL "knl")
     add_subdirectory(AVX512F)
Index: SingleSource/UnitTests/Vector/Makefile
===================================================================
--- SingleSource/UnitTests/Vector/Makefile
+++ SingleSource/UnitTests/Vector/Makefile
@@ -17,13 +17,26 @@
 endif
 
 ifeq ($(CC_UNDER_TEST_IS_CLANG), 1)
+
 ifeq ($(HAVE_X86_AVX512F_INSTRUCTIONS), 1)
 DIRS += AVX512F
 endif
+
+ifeq ($(HAVE_X86_AVX512VL_INSTRUCTIONS), 1)
+DIRS += AVX512VL
 ifeq ($(HAVE_X86_AVX512BW_INSTRUCTIONS), 1)
 DIRS += AVX512BWVL
 endif
 ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1)
+DIRS += AVX512DQVL
+endif
+endif
+
+ifeq ($(HAVE_X86_AVX512BW_INSTRUCTIONS), 1)
+DIRS += AVX512BW
+endif
+
+ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1)
 DIRS += AVX512DQ
 endif
 endif
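
Note on the pass/fail criterion used by the gather and scatter tests above: each masked gather test compares the intrinsic's result against element-wise selection between the gathered value and the pass-through vector, and each masked scatter test compares destination memory against a gold array in which only mask-selected slots were overwritten. The plain-C sketch below restates that scalar model for the 128-bit, 32-bit-element case. It is illustrative only: the ref_* names and the fixed width of 4 elements do not appear in the patch, and it assumes SCALE equals the element size in bytes, as it does in every test above, so scaled addressing reduces to ordinary array indexing.

  #include <stdint.h>

  /* Scalar model of a masked 4-element i32 gather:
   * result[j] = (mask bit j set) ? base[index[j]] : pass_thru[j]. */
  static void ref_mmask_i32gather_epi32(int32_t res[4], const int32_t pass_thru[4],
                                        uint8_t mask, const int32_t index[4],
                                        const int32_t *base) {
    for (int j = 0; j < 4; ++j)
      res[j] = (mask & (1 << j)) ? base[index[j]] : pass_thru[j];
  }

  /* Scalar model of a masked 4-element i32 scatter:
   * base[index[j]] = val[j] only where the mask bit is set;
   * unselected destinations keep their previous contents. */
  static void ref_mask_i32scatter_epi32(int32_t *base, uint8_t mask,
                                        const int32_t index[4],
                                        const int32_t val[4]) {
    for (int j = 0; j < 4; ++j)
      if (mask & (1 << j))
        base[index[j]] = val[j];
  }

The same model, widened to 8 elements per vector or to 64-bit data and indices, is what the init_data()/check() pairs encode for the 256-bit and the epi64/pd variants.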