Index: SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512BWVL/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512BWVL-") Index: SingleSource/UnitTests/Vector/AVX512BWVL/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/Makefile +++ SingleSource/UnitTests/Vector/AVX512BWVL/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512BWVL/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512bw -mavx512vl -I${SourceDir}/.. +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512bw -mavx512vl +LCCFLAGS += -march=native -mavx512bw -mavx512vl Index: SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/permutes.c @@ -0,0 +1,1014 @@ +/* + * Test pertumes and shuffle intrinsics. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm*_permute*_pd() + * _mm*_shuffle_epi8() + */ + +#include "m512_test_util.h" + +volatile int vol0 = 0; + +V512 i8; +V512 i8_mix; +V512 i8_big; +V512 i16; +V512 i16_mix; +V512 i16_big; +V512 i32; +V512 i32_mix; +V512 i32_big; +V512 i64; +V512 i64_mix; +V512 i64_big; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = i; + i8_mix.s8[i] = (i & 1) ? i : -i; + i8_big.s8[i] = 1000 * (i + 1); + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = i; + i16_mix.s16[i] = (i & 1) ? i : -i; + i16_big.s16[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i16_big.s16[i] = -i16_big.s16[i]; + } + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_mix.s32[i] = (i & 1) ? i : -i; + i32_big.s32[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i32_big.s32[i] = -i32_big.s32[i]; + } + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? i : -i; + i64_big.s64[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i64_big.s64[i] = -i64_big.s64[i]; + } + } +} + +#define CHECK_PSHUFB(n_elems, dest, mask, zeroing, name) \ + { \ + int i, lane; \ + for (i = 0; i < n_elems; i++) { \ + expected.s8[i] = 0; \ + if (i8_mix.s8[i] >= 0) { \ + lane = i / 16; \ + expected.s8[i] = i8.s8[16 * lane + (i8_mix.s8[i] & 0xf)]; \ + } \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s8[i] = 0; \ + } else { \ + expected.s8[i] = dest.s8[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems / 4, name, __LINE__); \ + i8_mix.xmmi[vol0] = i8_mix.xmmi[vol0]; \ + } + +void NOINLINE do_pshufb() { + V512 res; + V512 expected; + __mmask64 k64 = 0xFFFFFFFFFFFFFFFFLL; + + /* Non-masked versions. */ + res.xmmi[vol0] = _mm_shuffle_epi8(i8.xmmi[vol0], i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 0, "_mm_shuffle_epi8"); + + res.ymmi[vol0] = _mm256_shuffle_epi8(i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 0, "_mm256_shuffle_epi8"); + + res.zmmi = _mm512_shuffle_epi8(i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 0, "_mm512_shuffle_epi8"); + + /* Masked versions. 
*/ + k64 = 0xA4A4A4A4A4A4A4A4LL; + res.xmmi[vol0] = _mm_mask_shuffle_epi8(i8_big.xmmi[vol0], k64, i8.xmmi[vol0], + i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 0, "_mm_mask_shuffle_epi8"); + + res.ymmi[vol0] = _mm256_mask_shuffle_epi8(i8_big.ymmi[vol0], k64, + i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 0, "_mm256_mask_shuffle_epi8"); + + res.zmmi = _mm512_mask_shuffle_epi8(i8_big.zmmi, k64, i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 0, "_mm512_mask_shuffle_epi8"); + + /* Zero-masked versions. */ + k64 = 0x4A4A4A4A4A4A4A4ALL; + res.xmmi[vol0] = + _mm_maskz_shuffle_epi8(k64, i8.xmmi[vol0], i8_mix.xmmi[vol0]); + CHECK_PSHUFB(16, i8_big, k64, 1, "_mm_maskz_shuffle_epi8"); + + res.ymmi[vol0] = + _mm256_maskz_shuffle_epi8(k64, i8.ymmi[vol0], i8_mix.ymmi[vol0]); + CHECK_PSHUFB(32, i8_big, k64, 1, "_mm256_maskz_shuffle_epi8"); + + res.zmmi = _mm512_maskz_shuffle_epi8(k64, i8.zmmi, i8_mix.zmmi); + CHECK_PSHUFB(64, i8_big, k64, 1, "_mm512_maskz_shuffle_epi8"); +} + +void NOINLINE do_perm_epi32() { + __mmask16 k; + + volatile __m512i v1 = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m512i z1 = v1; + __m512i z2 = v2; + __m512i z3; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m256i y2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i y3; + __m256i e2; + + z3 = _mm512_permutexvar_epi32(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_epi32", __LINE__); + + k = 0xa97e; + + y3 = y1; + y3 = _mm256_mask_permutexvar_epi32(y3, k, y2, y1); + e2 = _mm256_set_epi32(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_epi32", __LINE__); + + z3 = v1; + z3 = _mm512_mask_permutexvar_epi32(z3, k, z2, z1); + e1 = _mm512_set_epi32(0, 14, 2, 12, 4, 10, 9, 7, 7, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_epi32", __LINE__); + + y3 = _mm256_maskz_permutexvar_epi32(k, y2, y1); + e2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_epi32", __LINE__); + + z3 = _mm512_maskz_permutexvar_epi32(k, z2, z1); + e1 = _mm512_set_epi32(0, 0, 2, 0, 4, 0, 0, 7, 0, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_epi32", __LINE__); +} + +void NOINLINE do_perm_ps() { + __mmask16 k; + + volatile __m512i v1 = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + __m512 z1 = _mm512_castsi512_ps(v1); + __m512i z2 = v2; + __m512 z3; + __m512i e1; + volatile __m256 y1 = + _mm256_castsi256_ps(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + volatile __m256i y2 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256 y3, e2; + + y3 = _mm256_permutevar8x32_ps(y1, y2); + e2 = _mm256_castsi256_ps(y2); + check_equal_nd(&y3, &e2, 8, "_mm256_permutevar8x32_ps", __LINE__); + + y3 = _mm256_permutexvar_ps(y2, y1); + e2 = _mm256_castsi256_ps(y2); + check_equal_nd(&y3, &e2, 8, "_mm256_permutexvar_ps", __LINE__); + + z3 = _mm512_permutexvar_ps(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_ps", __LINE__); + + k = 0xa97e; + y3 = y1; + y3 = _mm256_mask_permutexvar_ps(y3, k, y2, y1); + e2 = _mm256_castsi256_ps(_mm256_set_epi32(7, 1, 2, 3, 4, 5, 6, 0)); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_ps", __LINE__); + + k = 0xa97e; + z3 = 
_mm512_castsi512_ps(v1); + z3 = _mm512_mask_permutexvar_ps(z3, k, z2, z1); + e1 = _mm512_set_epi32(0, 14, 2, 12, 4, 10, 9, 7, 7, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_ps", __LINE__); + + k = 0xa97e; + y3 = y1; + y3 = _mm256_maskz_permutexvar_ps(k, y2, y1); + e2 = _mm256_castsi256_ps(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 0)); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_ps", __LINE__); + + k = 0xa97e; + z3 = _mm512_castsi512_ps(v1); + z3 = _mm512_maskz_permutexvar_ps(k, z2, z1); + e1 = _mm512_set_epi32(0, 0, 2, 0, 4, 0, 0, 7, 0, 9, 10, 11, 12, 13, 14, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_ps", __LINE__); +} + +#define CHECK_PERMI_PS(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + expected.f32[i] = i32.f32[4 * (i / 4) + (i32_mix.s32[i] & 0x3)]; \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f32[i] = 0; \ + } else { \ + expected.f32[i] = dest.f32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ + i32_mix.ymmi[vol0] = i32_mix.ymmi[vol0]; \ + } + +#define CHECK_PERMI_PS_IMM(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + expected.f32[i] = i32.f32[4 * (i / 4) + ((imm >> ((i % 4) * 2)) & 0x3)]; \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f32[i] = 0; \ + } else { \ + expected.f32[i] = dest.f32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ + i32.ymmi[vol0] = i32.ymmi[vol0]; \ + } + +void NOINLINE do_permi_ps() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + char imm; + + res.xmm[vol0] = _mm_permutevar_ps(i32.xmm[vol0], i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 0, "_mm_permutevar_ps"); + res.ymm[vol0] = _mm256_permutevar_ps(i32.ymm[vol0], i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 0, "_mm256_permutevar_ps"); + res.zmm = _mm512_permutevar_ps(i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 0, "_mm512_permutevar_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_mask_permutevar_ps(i32_big.xmm[vol0], k, i32.xmm[vol0], + i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 0, "_mm_mask_permutevar_ps"); + res.ymm[vol0] = _mm256_mask_permutevar_ps(i32_big.ymm[vol0], k, i32.ymm[vol0], + i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 0, "_mm256_mask_permutevar_ps"); + res.zmm = _mm512_mask_permutevar_ps(i32_big.zmm, k, i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 0, "_mm512_mask_permutevar_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_maskz_permutevar_ps(k, i32.xmm[vol0], i32_mix.xmmi[vol0]); + CHECK_PERMI_PS(2, i32_big, k, 1, "_mm_maskz_permutevar_ps"); + res.ymm[vol0] = + _mm256_maskz_permutevar_ps(k, i32.ymm[vol0], i32_mix.ymmi[vol0]); + CHECK_PERMI_PS(4, i32_big, k, 1, "_mm256_maskz_permutevar_ps"); + res.zmm = _mm512_maskz_permutevar_ps(k, i32.zmm, i32_mix.zmmi); + CHECK_PERMI_PS(8, i32_big, k, 1, "_mm512_maskz_permutevar_ps"); + + imm = 0xA4; + k = 0xFF; + res.xmm[vol0] = _mm_permute_ps(i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 0, "_mm_permute_ps"); + res.ymm[vol0] = _mm256_permute_ps(i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 0, "_mm256_permute_ps"); + res.zmm = _mm512_permute_ps(i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 0, "_mm512_permute_pd"); + + k = 0xA4; + res.xmm[vol0] = + _mm_mask_permute_ps(i32_big.xmm[vol0], k, i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 0, "_mm_mask_permute_ps"); + 
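+  /* Note: imm 0xA4 encodes two index bits per result element (LSB first),
+   * so each 4-float group becomes src[0], src[1], src[2], src[2];
+   * CHECK_PERMI_PS_IMM recomputes that selection from the immediate and then
+   * keeps the i32_big element (blend mask) or zeroes it (zero mask) for
+   * every k bit that is 0. */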
res.ymm[vol0] = + _mm256_mask_permute_ps(i32_big.ymm[vol0], k, i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 0, "_mm256_mask_permute_ps"); + res.zmm = _mm512_mask_permute_ps(i32_big.zmm, k, i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 0, "_mm512_mask_permute_ps"); + + k = 0xA4; + res.xmm[vol0] = _mm_maskz_permute_ps(k, i32.xmm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(2, i32_big, k, 1, "_mm_maskz_permute_ps"); + res.ymm[vol0] = _mm256_maskz_permute_ps(k, i32.ymm[vol0], 0xA4); + CHECK_PERMI_PS_IMM(4, i32_big, k, 1, "_mm256_maskz_permute_ps"); + res.zmm = _mm512_maskz_permute_ps(k, i32.zmm, 0xA4); + CHECK_PERMI_PS_IMM(8, i32_big, k, 1, "_mm512_maskz_permute_ps"); +} + +void NOINLINE do_perm_epi64() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + __m512i z1 = v1; + __m512i z2 = v2; + __m512i z3; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi64x(3, 2, 1, 0); + volatile __m256i y2 = _mm256_set_epi64x(0, 1, 2, 3); + __m256i y3, e2; + + y3 = _mm256_permutexvar_epi64(y2, y1); + e2 = y2; + check_equal_nd(&y3, &e2, 8, "_mm256_permutexvar_epi64", __LINE__); + + z3 = _mm512_permutexvar_epi64(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_epi64", __LINE__); + + k = 0x7e; + y3 = y1; + y3 = _mm256_mask_permutexvar_epi64(y3, k, y2, y1); + e2 = _mm256_set_epi64x(0, 1, 2, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_mask_permutexvar_epi64", __LINE__); + + k = 0x7e; + z3 = v1; + z3 = _mm512_mask_permutexvar_epi64(z3, k, z2, z1); + e1 = _mm512_set_epi64(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_epi64", __LINE__); + + k = 0x7c; + y3 = y1; + y3 = _mm256_maskz_permutexvar_epi64(k, y2, y1); + e2 = _mm256_set_epi64x(0, 1, 0, 0); + check_equal_nd(&y3, &e2, 8, "_mm256_maskz_permutexvar_epi64", __LINE__); + + k = 0x7e; + z3 = v1; + z3 = _mm512_maskz_permutexvar_epi64(k, z2, z1); + e1 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_epi64", __LINE__); +} + +void NOINLINE do_perm_pd() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + volatile __m512i v2 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + __m512d z1 = _mm512_castsi512_pd(v1); + __m512i z2 = v2; + __m512d z3; + __m512i e1; + volatile __m256i yv1; + volatile __m256i yv2; + __m256d y1; + __m256i y2; + __m256d y3; + __m256i ye1; + + z3 = _mm512_permutexvar_pd(z2, z1); + check_equal_nd(&z3, &z2, 16, "_mm512_permutexvar_pd", __LINE__); + + k = 0x7e; + z3 = _mm512_castsi512_pd(v1); + z3 = _mm512_mask_permutexvar_pd(z3, k, z2, z1); + e1 = _mm512_set_epi64(7, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_mask_permutexvar_pd", __LINE__); + + z3 = _mm512_maskz_permutexvar_pd(k, z2, z1); + e1 = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 0); + check_equal_nd(&z3, &e1, 16, "_mm512_maskz_permutexvar_pd", __LINE__); + + /* 256 */ + yv1 = _mm256_set_epi64x(7, 6, 5, 4); + yv2 = _mm256_set_epi64x(4, 5, 6, 7); + y1 = _mm256_castsi256_pd(yv1); + y2 = yv2; + + y3 = _mm256_permutexvar_pd(y2, y1); + check_equal_nd(&y3, &y2, 8, "_mm256_permutexvar_pd", __LINE__); + + k = 0x6; + y3 = _mm256_castsi256_pd(yv1); + y3 = _mm256_mask_permutexvar_pd(y3, k, y2, y1); + ye1 = _mm256_set_epi64x(7, 5, 6, 4); + check_equal_nd(&y3, &ye1, 8, "_mm256_mask_permutexvar_pd", __LINE__); + + y3 = _mm256_maskz_permutexvar_pd(k, y2, y1); + ye1 = _mm256_set_epi64x(0, 5, 6, 0); + check_equal_nd(&y3, &ye1, 8, "_mm256_maskz_permutexvar_pd", 
__LINE__); +} + +#define CHECK_PERMI_PD(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + if ((i64_mix.s64[i] & 0x2) == 0) { \ + expected.f64[i] = i64.f64[2 * (i / 2)]; \ + } else { \ + expected.f64[i] = i64.f64[2 * (i / 2) + 1]; \ + } \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + i64_mix.ymmi[vol0] = i64_mix.ymmi[vol0]; \ + } + +#define CHECK_PERMI_PD_IMM(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + if (((imm >> i) & 0x1) == 0) { \ + expected.f64[i] = i64.f64[2 * (i / 2)]; \ + } else { \ + expected.f64[i] = i64.f64[2 * (i / 2) + 1]; \ + } \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + i64.ymmi[vol0] = i64.ymmi[vol0]; \ + } + +void NOINLINE do_permi_pd() { + V512 res; + V512 expected; + __mmask8 k = 0xFF; + char imm; + + res.xmmd[vol0] = _mm_permutevar_pd(i64.xmmd[vol0], i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 0, "_mm_permutevar_pd"); + res.ymmd[vol0] = _mm256_permutevar_pd(i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 0, "_mm256_permutevar_pd"); + res.zmmd = _mm512_permutevar_pd(i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 0, "_mm512_permutevar_pd"); + + k = 0xA4; + res.xmmd[vol0] = _mm_mask_permutevar_pd(i64_big.xmmd[vol0], k, i64.xmmd[vol0], + i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 0, "_mm_mask_permutevar_pd"); + res.ymmd[vol0] = _mm256_mask_permutevar_pd( + i64_big.ymmd[vol0], k, i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 0, "_mm256_mask_permutevar_pd"); + res.zmmd = _mm512_mask_permutevar_pd(i64_big.zmmd, k, i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 0, "_mm512_mask_permutevar_pd"); + + k = 0xA4; + res.xmmd[vol0] = + _mm_maskz_permutevar_pd(k, i64.xmmd[vol0], i64_mix.xmmi[vol0]); + CHECK_PERMI_PD(2, i64_big, k, 1, "_mm_maskz_permutevar_pd"); + res.ymmd[vol0] = + _mm256_maskz_permutevar_pd(k, i64.ymmd[vol0], i64_mix.ymmi[vol0]); + CHECK_PERMI_PD(4, i64_big, k, 1, "_mm256_maskz_permutevar_pd"); + res.zmmd = _mm512_maskz_permutevar_pd(k, i64.zmmd, i64_mix.zmmi); + CHECK_PERMI_PD(8, i64_big, k, 1, "_mm512_maskz_permutevar_pd"); + + imm = 0xA4; + k = 0xFF; + res.xmmd[vol0] = _mm_permute_pd(i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 0, "_mm_permute_pd"); + res.ymmd[vol0] = _mm256_permute_pd(i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 0, "_mm256_permute_pd"); + res.zmmd = _mm512_permute_pd(i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 0, "_mm512_permute_pd"); + + k = 0xA4; + res.xmmd[vol0] = + _mm_mask_permute_pd(i64_big.xmmd[vol0], k, i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 0, "_mm_mask_permute_pd"); + res.ymmd[vol0] = + _mm256_mask_permute_pd(i64_big.ymmd[vol0], k, i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 0, "_mm256_mask_permute_pd"); + res.zmmd = _mm512_mask_permute_pd(i64_big.zmmd, k, i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 0, "_mm512_mask_permute_pd"); + + k = 0xA4; + res.xmmd[vol0] = _mm_maskz_permute_pd(k, i64.xmmd[vol0], 0xA4 & 0x3); + CHECK_PERMI_PD_IMM(2, i64_big, k, 1, "_mm_maskz_permute_pd"); + res.ymmd[vol0] = 
_mm256_maskz_permute_pd(k, i64.ymmd[vol0], 0xA4 & 0xf); + CHECK_PERMI_PD_IMM(4, i64_big, k, 1, "_mm256_maskz_permute_pd"); + res.zmmd = _mm512_maskz_permute_pd(k, i64.zmmd, 0xA4); + CHECK_PERMI_PD_IMM(8, i64_big, k, 1, "_mm512_maskz_permute_pd"); +} + +void NOINLINE do_perm_epi64_imm() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + __m512i z1 = v1; + __m512i z2; + __m512i e1; + volatile __m256i y1 = _mm256_set_epi64x(3, 2, 1, 0); + __m256i y2, e2; + + y2 = y1; + y2 = _mm256_permutex_epi64(y2, 0x7a); + e2 = _mm256_set_epi64x(1, 3, 2, 2); + check_equal_nd(&y2, &e2, 8, "_mm256_permutex_epi64", __LINE__); + + z2 = _mm512_permutex_epi64(z1, 0x7a); + e1 = _mm512_set_epi64(5, 7, 6, 6, 1, 3, 2, 2); + check_equal_nd(&z2, &e1, 16, "_mm512_permutex_epi64", __LINE__); + + k = 0x7e; + y2 = y1; + y2 = _mm256_mask_permutex_epi64(y2, k, y2, 0x7a); + e2 = _mm256_set_epi64x(1, 3, 2, 0); + check_equal_nd(&y2, &e2, 8, "_mm256_mask_permutex_epi64", __LINE__); + + k = 0x7e; + z2 = v1; + z2 = _mm512_mask_permutex_epi64(z2, k, z2, 0x7a); + e1 = _mm512_set_epi64(7, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_mask_permutex_epi64", __LINE__); + + k = 0x76; + y2 = y1; + y2 = _mm256_maskz_permutex_epi64(k, y2, 0x7a); + e2 = _mm256_set_epi64x(0, 3, 2, 0); + check_equal_nd(&y2, &e2, 8, "_mm256_maskz_permutex_epi64", __LINE__); + + k = 0x7e; + z2 = v1; + z2 = _mm512_maskz_permutex_epi64(k, z2, 0x7a); + e1 = _mm512_set_epi64(0, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_maskz_permutex_epi64", __LINE__); +} + +void NOINLINE do_perm_pd_imm() { + __mmask8 k; + + volatile __m512i v1 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0); + __m512d z1 = _mm512_castsi512_pd(v1); + __m512d z2; + __m512i e1; + volatile __m256i yv1; + __m256d y1; + __m256d y2; + __m256i ye1; + + z2 = _mm512_permutex_pd(z1, 0x7a); + e1 = _mm512_set_epi64(5, 7, 6, 6, 1, 3, 2, 2); + check_equal_nd(&z2, &e1, 16, "_mm512_permutex_pd", __LINE__); + + k = 0x7e; + z2 = _mm512_castsi512_pd(v1); + z2 = _mm512_mask_permutex_pd(z2, k, z2, 0x7a); + e1 = _mm512_set_epi64(7, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_mask_permutex_pd", __LINE__); + + z2 = _mm512_castsi512_pd(v1); + z2 = _mm512_maskz_permutex_pd(k, z2, 0x7a); + e1 = _mm512_set_epi64(0, 7, 6, 6, 1, 3, 2, 0); + check_equal_nd(&z2, &e1, 16, "_mm512_maskz_permutex_pd", __LINE__); + + /* 256 */ + yv1 = _mm256_set_epi64x(7, 6, 5, 4); + y1 = _mm256_castsi256_pd(yv1); + + y2 = _mm256_permutex_pd(y1, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 6); + check_equal_nd(&y2, &ye1, 8, "_mm256_permutex_pd", __LINE__); + + k = 0x7e; + y2 = _mm256_castsi256_pd(yv1); + y2 = _mm256_mask_permutex_pd(y2, k, y2, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 4); + check_equal_nd(&y2, &ye1, 8, "_mm256_mask_permutex_pd", __LINE__); + + y2 = _mm256_castsi256_pd(yv1); + y2 = _mm256_maskz_permutex_pd(k, y2, 0xa); + ye1 = _mm256_set_epi64x(4, 4, 6, 0); + check_equal_nd(&y2, &ye1, 8, "_mm256_maskz_permutex_pd", __LINE__); +} + +void NOINLINE do_perm_ti_2w() { + V512 res; + V512 expected; + volatile int i; + __mmask32 k; + + res.zmmi = _mm512_permutex2var_epi16(i16.zmmi, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? 
i16_big.s16[index] : i16.s16[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi16", __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_mask_permutex2var_epi16(i16.zmmi, k, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = i16.s16[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi16", + __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_mask2_permutex2var_epi16(i16.zmmi, i16_mix.zmmi, k, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = i16_mix.s16[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi16", + __LINE__); + + i16_big.xmmi[vol0] = i16_big.xmmi[vol0]; + + k = 0xabcdffef; + res.zmmi = + _mm512_maskz_permutex2var_epi16(k, i16.zmmi, i16_mix.zmmi, i16_big.zmmi); + for (i = 0; i < 32; i++) { + int index = i16_mix.s16[i] & 0x1f; + expected.s16[i] = + (i16_mix.s16[i] & 0x20) ? i16_big.s16[index] : i16.s16[index]; + if ((k & (1 << i)) == 0) { + expected.s16[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi16", + __LINE__); +} + +void NOINLINE do_perm_ti_2d() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k; + + res.zmmi = _mm512_permutex2var_epi32(i32.zmmi, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi32", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmmi = + _mm512_mask_permutex2var_epi32(i32.zmmi, k, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi32", + __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xdcba; + res.zmmi = + _mm512_mask2_permutex2var_epi32(i32.zmmi, i32_mix.zmmi, k, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi32", + __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmmi = + _mm512_maskz_permutex2var_epi32(k, i32.zmmi, i32_mix.zmmi, i32_big.zmmi); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi32", + __LINE__); +} + +void NOINLINE do_perm_ti_2q() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k; + + res.zmmi = _mm512_permutex2var_epi64(i64.zmmi, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? 
i64_big.s64[index] : i64.s64[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_epi64", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmi = + _mm512_mask_permutex2var_epi64(i64.zmmi, k, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_epi64", + __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmi = + _mm512_mask2_permutex2var_epi64(i64.zmmi, i64_mix.zmmi, k, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64_mix.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_epi64", + __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xe7; + res.zmmi = + _mm512_maskz_permutex2var_epi64(k, i64.zmmi, i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_epi64", + __LINE__); +} + +void NOINLINE do_perm_ti_2ps() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k; + + res.zmm = _mm512_permutex2var_ps(i32.zmm, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_mask_permutex2var_ps(i32.zmm, k, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_mask2_permutex2var_ps(i32.zmm, i32_mix.zmmi, k, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_ps", __LINE__); + + i32_big.xmmi[vol0] = i32_big.xmmi[vol0]; + + k = 0xabcd; + res.zmm = _mm512_maskz_permutex2var_ps(k, i32.zmm, i32_mix.zmmi, i32_big.zmm); + for (i = 0; i < 16; i++) { + int index = i32_mix.s32[i] & 0xf; + expected.s32[i] = + (i32_mix.s32[i] & 0x10) ? i32_big.s32[index] : i32.s32[index]; + if ((k & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_ps", __LINE__); +} + +void NOINLINE do_perm_ti_2pd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k; + + res.zmmd = _mm512_permutex2var_pd(i64.zmmd, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? 
i64_big.s64[index] : i64.s64[index]; + } + check_equal_nd(&res, &expected, 16, "_mm512_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_mask_permutex2var_pd(i64.zmmd, k, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_mask2_permutex2var_pd(i64.zmmd, i64_mix.zmmi, k, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = i64_mix.s64[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask2_permutex2var_pd", __LINE__); + + i64_big.xmmi[vol0] = i64_big.xmmi[vol0]; + + k = 0xf9; + res.zmmd = + _mm512_maskz_permutex2var_pd(k, i64.zmmd, i64_mix.zmmi, i64_big.zmmd); + for (i = 0; i < 8; i++) { + int index = i64_mix.s64[i] & 0x7; + expected.s64[i] = + (i64_mix.s64[i] & 0x8) ? i64_big.s64[index] : i64.s64[index]; + if ((k & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_permutex2var_pd", __LINE__); +} + +#define CHECK_PERMW(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.s16[i] = i16_mix.s16[i16.s16[i] & 0x1f]; \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.s16[i] = 0; \ + } else { \ + expected.s16[i] = dest.s16[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems / 2, name, __LINE__); \ + i16.xmmi[vol0] = i16.xmmi[vol0]; \ + } + +void NOINLINE do_permw() { + V512 res; + V512 expected; + __mmask32 k32 = 0xFFFFFFFF; + + res.xmmi[vol0] = _mm_permutexvar_epi16(i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 0, "_mm_permutexvar_epi16"); + + res.ymmi[vol0] = _mm256_permutexvar_epi16(i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 0, "_mm256_permutexvar_epi16"); + + res.zmmi = _mm512_permutexvar_epi16(i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 0, "_mm512_permutexvar_epi16"); + + k32 = 0xA4A4A4A4; + res.xmmi[vol0] = _mm_mask_permutexvar_epi16( + i16_big.xmmi[vol0], k32, i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 0, "_mm_mask_permutexvar_epi16"); + + res.ymmi[vol0] = _mm256_mask_permutexvar_epi16( + i16_big.ymmi[vol0], k32, i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 0, "_mm256_mask_permutexvar_epi16"); + + res.zmmi = + _mm512_mask_permutexvar_epi16(i16_big.zmmi, k32, i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 0, "_mm512_mask_permutexvar_epi16"); + + k32 = 0x4A4A4A4A; + res.xmmi[vol0] = + _mm_maskz_permutexvar_epi16(k32, i16.xmmi[vol0], i16_mix.xmmi[vol0]); + CHECK_PERMW(8, i16_big, k32, 1, "_mm_maskz_permutexvar_epi16"); + + res.ymmi[vol0] = + _mm256_maskz_permutexvar_epi16(k32, i16.ymmi[vol0], i16_mix.ymmi[vol0]); + CHECK_PERMW(16, i16_big, k32, 1, "_mm256_maskz_permutexvar_epi16"); + + res.zmmi = _mm512_maskz_permutexvar_epi16(k32, i16.zmmi, i16_mix.zmmi); + CHECK_PERMW(32, i16_big, k32, 1, "_mm512_maskz_permutexvar_epi16"); +} + +void NOINLINE do_blendmps() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x3456; + + res.zmm = _mm512_mask_blend_ps(k, 
i32.zmm, i32_mix.zmm); + for (i = 0; i < 16; i++) { + expected.s32[i] = i32.s32[i]; + if ((k & (1 << i)) != 0) { + expected.s32[i] = i32_mix.s32[i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_blend_ps", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_pshufb(); + + do_perm_epi32(); + do_perm_ps(); + do_permi_ps(); + + do_perm_epi64(); + do_perm_pd(); + do_permi_pd(); + + do_perm_epi64_imm(); + do_perm_pd_imm(); + + do_perm_ti_2w(); + + do_perm_ti_2d(); + do_perm_ti_2q(); + + do_perm_ti_2ps(); + do_perm_ti_2pd(); + + do_permw(); + + do_blendmps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/permutes.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.c @@ -0,0 +1,272 @@ +// +// Testcases for +// 128/256/512 bit vdbpsadbw +// intrinsics with no mask/blend mask/zero mask forms. +// +// Here we check for _mm*_dbsad_epu8 intrinsics. +// + +#include "m512_test_util.h" +#include + +#if DEBUG +#define dprintf(...) printf(__VA_ARGS__) +#define ddisplay_w(...) display_pw(__VA_ARGS__) +#define ddisplay_b(...) display_pb(__VA_ARGS__) +#else +#define dprintf(...) +#define ddisplay_w(...) +#define ddisplay_b(...) +#endif // DEBUG + +typedef int bool; +#define true 1 +#define false 0 + +#define CHECK_DBPSADBW(opcode, res_bit_size, is_masked, mask, is_zero_mask, \ + imm) \ + { \ + int fail = 0; \ + /* Compute the expected result. */ \ + expVal.zmmi = \ + compute_vdbpsadbw(&expVal, is_masked, mask, is_zero_mask, imm, \ + &bop1.zmmi, &bop2.zmmi, res_bit_size); \ + \ + /* Compare the obtained and expected results. */ \ + fail = check_equal_nw(&res, &expVal, (res_bit_size / 16), \ + is_masked ? (is_zero_mask ? 
opcode " zero mask" \ + : opcode " blend mask") \ + : opcode " no mask", \ + __LINE__); \ + if (fail) { \ + dprintf("\n"); \ + ddisplay_w(&wres_orig, "old:", res_bit_size / 16); \ + dprintf("\n"); \ + ddisplay_b(&bop2, "bop2:", res_bit_size / 8); \ + dprintf("\n"); \ + ddisplay_b(&bop1, "bop1:", res_bit_size / 8); \ + dprintf("\n===========================================\n"); \ + } \ + } + +#define XYDBPSADBW(opcode, res_bit_size, mmsuffix, is_masked, mask, \ + is_zero_mask, imm, xy) \ + { \ + if (is_masked) { \ + if (is_zero_mask) { \ + /* Zero masking */ \ + memset(&res, 0xFF, sizeof(res)); \ + res.xy##mmi[0] = mmsuffix##maskz_##dbsad_epu8(mask, bop1.xy##mmi[0], \ + bop2.xy##mmi[0], imm); \ + } else { \ + /* Blend masking */ \ + memcpy(&res, &wres_orig, sizeof(res)); \ + res.xy##mmi[0] = mmsuffix##mask_##dbsad_epu8( \ + res.xy##mmi[0], mask, bop1.xy##mmi[0], bop2.xy##mmi[0], imm); \ + } \ + } else { \ + /* No masking */ \ + memset(&res, 0x0, sizeof(res)); \ + res.xy##mmi[0] = \ + mmsuffix##dbsad_epu8(bop1.xy##mmi[0], bop2.xy##mmi[0], imm); \ + } \ + CHECK_DBPSADBW(opcode, res_bit_size, is_masked, mask, is_zero_mask, imm) \ + } + +#define ZDBPSADBW(opcode, is_masked, mask, is_zero_mask, imm) \ + { \ + if (is_masked) { \ + if (is_zero_mask) { /* Zero masking */ \ + memset(&res, 0xFF, sizeof(res)); \ + res.zmmi = _mm512_maskz_##dbsad_epu8(mask, bop1.zmmi, bop2.zmmi, imm); \ + } else { /* Blend masking */ \ + memcpy(&res, &wres_orig, sizeof(res)); \ + res.zmmi = _mm512_mask_##dbsad_epu8(res.zmmi, mask, bop1.zmmi, \ + bop2.zmmi, imm); \ + } \ + } else { /* No masking */ \ + memset(&res, 0x0, sizeof(res)); \ + res.zmmi = _mm512_##dbsad_epu8(bop1.zmmi, bop2.zmmi, imm); \ + } \ + CHECK_DBPSADBW(opcode, 512, is_masked, mask, is_zero_mask, imm) \ + } + +// +// Data +// + +volatile unsigned short u16_orig_arr[32] = { + 0x1000, 0x1100, 0x2200, 0x3300, 0x4400, 0x5500, 0x6600, 0x7700, + 0x8800, 0x9900, 0xaa00, 0xbb00, 0xcc00, 0xdd00, 0xee00, 0xff00, + 0x1234, 0x1111, 0x2222, 0x3333, 0x4444, 0x5555, 0x6666, 0x7777, + 0x8888, 0x9999, 0xaaaa, 0xbbbb, 0xcccc, 0xdddd, 0xeeee, 0xffff}; + +V512 bop1, bop2; +V512 res, expVal; +V512 wres_orig; + +static void NOINLINE init() { + int i; + + // i8 operand vectors + // + for (i = 0; i < 64; i++) { + bop1.s8[i] = i; + } + for (i = 63; i >= 0; i--) { + bop2.s8[63 - i] = i; + } + + // Destructed operand vectors + memcpy((void *)&wres_orig, (void *)u16_orig_arr, 64); +} + +// +// Emulate the vdbpsadbw operation. +// + +__m512i NOINLINE compute_vdbpsadbw(void *res, bool is_masked, unsigned int mask, + bool zero_mask, int imm, const void *op1, + const void *op2, int res_bit_size) { + V512 *vres = (V512 *)res; + V512 *vop1 = (V512 *)op1; + V512 *vop2 = (V512 *)op2; + V512 vtmp; + + int lanes = res_bit_size / 128; + int lane, res_i; + int elems, elem; + + dprintf("\n\n"); + + // Do unmasked vdbpsadbw to get temp result. 
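+  // Step 1 (first loop): the 8-bit immediate selects one of the four dwords
+  // of op2 for each destination dword position inside every 128-bit lane
+  // (two bits per position).
+  // Step 2 (second loop): each 64-bit chunk then yields four word results,
+  // each the sum of absolute differences of a 4-byte group of op1 against a
+  // sliding 4-byte window of the shuffled op2, e.g.
+  //   res_wp[1] = |op1[0]-tmp[1]| + |op1[1]-tmp[2]| +
+  //               |op1[2]-tmp[3]| + |op1[3]-tmp[4]|.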
+ // + for (lane = 0; lane < lanes; lane++) { + + dprintf("\n"); + for (elem = 0; elem < 4; elem++) { + int op_i; + + res_i = lane * 4 + elem; + op_i = lane * 4 + ((imm >> (2 * elem)) & 0x3); + vtmp.u32[res_i] = vop2->u32[op_i]; + + dprintf("l,e %d:%d, tmp[%d] = op2[%d]\n", lane, elem, res_i, op_i); + } + } + + elems = res_bit_size / 64; + + for (elem = 0; elem < elems; elem++) { + unsigned short *res_wp = (unsigned short *)&vres->u64[elem]; + unsigned char *op1_bp = (unsigned char *)&vop1->u64[elem]; + unsigned char *tmp_bp = (unsigned char *)&vtmp.u64[elem]; + + res_wp[0] = abs(op1_bp[0] - tmp_bp[0]) + abs(op1_bp[1] - tmp_bp[1]) + + abs(op1_bp[2] - tmp_bp[2]) + abs(op1_bp[3] - tmp_bp[3]); + + res_wp[1] = abs(op1_bp[0] - tmp_bp[1]) + abs(op1_bp[1] - tmp_bp[2]) + + abs(op1_bp[2] - tmp_bp[3]) + abs(op1_bp[3] - tmp_bp[4]); + + res_wp[2] = abs(op1_bp[4] - tmp_bp[2]) + abs(op1_bp[5] - tmp_bp[3]) + + abs(op1_bp[6] - tmp_bp[4]) + abs(op1_bp[7] - tmp_bp[5]); + + res_wp[3] = abs(op1_bp[4] - tmp_bp[3]) + abs(op1_bp[5] - tmp_bp[4]) + + abs(op1_bp[6] - tmp_bp[5]) + abs(op1_bp[7] - tmp_bp[6]); + } + + // Apply masking to get final result. + // + elems = res_bit_size / 16; + + for (res_i = 0; res_i < elems; res_i++) { + int elem_mask; + + elem_mask = mask & (1 << res_i); + + // The unmasked computation above has taken care of + // the elem_mask == 1 case. + if (elem_mask == 0) { + if (zero_mask) { + // Zeroing behavior. + vres->u16[res_i] = 0; + } else { + // Blending behavior + vres->u16[res_i] = wres_orig.u16[res_i]; + } + } + } + + return vres->zmmi; +} + +// +// Mask values. +// + +#define KMASK8_NONE 0xff +#define KMASK16_NONE 0xffff +#define KMASK32_NONE 0xffffffff + +#define KMASK8_ONES 0xff +#define KMASK16_ONES 0xffff +#define KMASK32_ONES 0xffffffff + +#define KMASK8_ALT 0xaa +#define KMASK16_ALT 0xaaaa +#define KMASK32_ALT 0xaaaaaaaa + +// +// Immediate value. 
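+// IMM_3210 == 0xe4 == 0b11100100, i.e. dword selectors 3,2,1,0 from the
+// highest to the lowest field, so the shuffled temporary equals op2 unchanged.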
+// +#define IMM_3210 0xe4 + +// +// Tests for vdbpsadbw +// +void do_xdbpsadbw() { + XYDBPSADBW("EDBPSADBW", 128, _mm_, false, KMASK8_NONE, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ONES, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ALT, false, IMM_3210, x); + + XYDBPSADBW("EDBPSADBW", 128, _mm_, true, KMASK8_ALT, true, IMM_3210, x); +} + +void do_ydbpsadbw() { + XYDBPSADBW("YDBPSADBW", 256, _mm256_, false, KMASK16_NONE, false, IMM_3210, + y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ONES, false, IMM_3210, y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ALT, false, IMM_3210, y); + + XYDBPSADBW("YDBPSADBW", 256, _mm256_, true, KMASK16_ALT, true, IMM_3210, y); +} + +void do_zdbpsadbw() { + ZDBPSADBW("ZDBPSADBW", false, KMASK32_NONE, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ONES, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ALT, false, IMM_3210); + + ZDBPSADBW("ZDBPSADBW", true, KMASK32_ALT, true, IMM_3210); +} + +int main() { + init(); + + do_xdbpsadbw(); + do_ydbpsadbw(); + do_zdbpsadbw(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/psadbw.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.c @@ -0,0 +1,214 @@ +/* + * Test intrinsics for vptestm[bdqw] and vptestnm[bdqw]. + * Here we check for _mm512_test_[epi32|epi64]_mask intrinsics. + */ + +#include "m512_test_util.h" +#include + +volatile int ten = 10; + +#define TEST(base_intrin, mask_intrin, r1, r2, correct_res) \ + { \ + __int64 _i, _mask; \ + __int64 _cres = (correct_res); \ + __int64 _res = base_intrin((r1), (r2)); \ + if (_res != _cres) { \ + printf(#base_intrin "(" #r1 ", " #r2 ") failed\n"); \ + printf("Expected 0x%08x%08x; got 0x%08x%08x\n", (int)(_cres >> 32), \ + (int)_cres, (int)(_res >> 32), (int)_res); \ + n_errs++; \ + } \ + for (_i = 0; _i < ten; _i++) { \ + _mask = ((__int64)rand() << 32) | rand(); \ + _res = mask_intrin(_mask, (r1), (r2)); \ + _cres = (correct_res)&_mask; \ + if (_res != _cres) { \ + printf(#mask_intrin "(0x%08x%08x, " #r1 ", " #r2 ") " \ + "failed\n", \ + (int)(_mask >> 32), (int)_mask); \ + printf("Expected 0x%08x%08x; got 0x%08x%08x\n", (int)(_cres >> 32), \ + (int)_cres, (int)(_res >> 32), (int)_res); \ + n_errs++; \ + } \ + } \ + } + +V512 i8; +V512 mix8; +V512 i16; +V512 mix16; + +V512 i32; +V512 mix32; + +V512 i64; +V512 mix64; + +volatile int vol0 = 0; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = -1; + mix8.s8[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = -1; + mix16.s16[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = -1; + mix32.s32[i] = (i & 1) ? 0 : -1; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = -1; + mix64.s64[i] = (i & 1) ? 0 : -1; + } +} + +void NOINLINE do_ptestmb() { + TEST(_mm_test_epi8_mask, _mm_mask_test_epi8_mask, mix8.xmmi[0], i8.xmmi[0], + 0x5555); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. 
*/ + + TEST(_mm256_test_epi8_mask, _mm256_mask_test_epi8_mask, mix8.ymmi[0], + i8.ymmi[0], 0x55555555); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi8_mask, _mm512_mask_test_epi8_mask, mix8.zmmi, i8.zmmi, + 0x5555555555555555); +} + +void NOINLINE do_ptestmw() { + TEST(_mm_test_epi16_mask, _mm_mask_test_epi16_mask, mix16.xmmi[0], + i16.xmmi[0], 0x55); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi16_mask, _mm256_mask_test_epi16_mask, mix16.ymmi[0], + i16.ymmi[0], 0x5555); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi16_mask, _mm512_mask_test_epi16_mask, mix16.zmmi, + i16.zmmi, 0x55555555); +} + +void NOINLINE do_ptestmd() { + TEST(_mm_test_epi32_mask, _mm_mask_test_epi32_mask, mix32.xmmi[0], + i32.xmmi[0], 0x5); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi32_mask, _mm256_mask_test_epi32_mask, mix32.ymmi[0], + i32.ymmi[0], 0x55); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi32_mask, _mm512_mask_test_epi32_mask, mix32.zmmi, + i32.zmmi, 0x5555); +} + +void NOINLINE do_ptestmq() { + TEST(_mm_test_epi64_mask, _mm_mask_test_epi64_mask, mix64.xmmi[0], + i64.xmmi[0], 0x1); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_test_epi64_mask, _mm256_mask_test_epi64_mask, mix64.ymmi[0], + i64.ymmi[0], 0x5); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_test_epi64_mask, _mm512_mask_test_epi64_mask, mix64.zmmi, + i64.zmmi, 0x55); +} + +void NOINLINE do_ptestnmb() { + TEST(_mm_testn_epi8_mask, _mm_mask_testn_epi8_mask, mix8.xmmi[0], i8.xmmi[0], + 0xaaaa); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi8_mask, _mm256_mask_testn_epi8_mask, mix8.ymmi[0], + i8.ymmi[0], 0xaaaaaaaa); + + i8.xmmi[vol0] = i8.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi8_mask, _mm512_mask_testn_epi8_mask, mix8.zmmi, i8.zmmi, + 0xaaaaaaaaaaaaaaaa); +} + +void NOINLINE do_ptestnmw() { + TEST(_mm_testn_epi16_mask, _mm_mask_testn_epi16_mask, mix16.xmmi[0], + i16.xmmi[0], 0xaa); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi16_mask, _mm256_mask_testn_epi16_mask, mix16.ymmi[0], + i16.ymmi[0], 0xaaaa); + + i16.xmmi[vol0] = i16.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi16_mask, _mm512_mask_testn_epi16_mask, mix16.zmmi, + i16.zmmi, 0xaaaaaaaa); +} + +void NOINLINE do_ptestnmd() { + TEST(_mm_testn_epi32_mask, _mm_mask_testn_epi32_mask, mix32.xmmi[0], + i32.xmmi[0], 0xa); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi32_mask, _mm256_mask_testn_epi32_mask, mix32.ymmi[0], + i32.ymmi[0], 0xaa); + + i32.xmmi[vol0] = i32.xmmi[vol0]; /* No-op. */ + + TEST(_mm512_testn_epi32_mask, _mm512_mask_testn_epi32_mask, mix32.zmmi, + i32.zmmi, 0xaaaa); +} + +void NOINLINE do_ptestnmq() { + TEST(_mm_testn_epi64_mask, _mm_mask_testn_epi64_mask, mix64.xmmi[0], + i64.xmmi[0], 0x2); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. */ + + TEST(_mm256_testn_epi64_mask, _mm256_mask_testn_epi64_mask, mix64.ymmi[0], + i64.ymmi[0], 0xa); + + i64.xmmi[vol0] = i64.xmmi[vol0]; /* No-op. 
*/ + + TEST(_mm512_testn_epi64_mask, _mm512_mask_testn_epi64_mask, mix64.zmmi, + i64.zmmi, 0xaa); +} + +int main(int argc, char *argv[]) { + init(); + + do_ptestmb(); + do_ptestmw(); + do_ptestmd(); + do_ptestmq(); + + do_ptestnmb(); + do_ptestnmw(); + do_ptestnmd(); + do_ptestnmq(); + + if (n_errs) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/ptestm.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.c @@ -0,0 +1,294 @@ +/* + * Test shifts and rotates. + * + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_shuffle_epi32() + * _mm_shufflehi_epi16() + * _mm_shufflelo_epi16() + * _mm256_shuffle_epi32() + * _mm256_shufflehi_epi16() + * _mm256_shufflelo_epi16() + * _mm512_shuffle_epi32() + * _mm512_shufflehi_epi16() + * _mm512_shufflelo_epi16() + */ + +#include "m512_test_util.h" +#include +#include + +V512 counts16, counts32, counts64, src, passthru, zeros; +__mmask8 k8 = 0xf9; +__mmask16 k16 = 0x9ffe; + +volatile int vol0; + +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. + */ +#define soft_src_update() src.xmmi[vol0] = src.xmmi[vol0] +#define soft_counts16_update() counts16.xmmi[vol0] = counts16.xmmi[vol0] +#define soft_counts32_update() counts32.xmmi[vol0] = counts32.xmmi[vol0] +#define soft_counts64_update() counts64.xmmi[vol0] = counts64.xmmi[vol0] + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + counts32.s32[i] = 3; + zeros.u32[i] = 0; + src.s32[i] = -27 * i * i + 300 * i - 82; + if (i & 0x1) { + src.s32[i] *= -1; + } + passthru.s32[i] = 48 * i * i + 100 * i - 9; + } + + for (i = 0; i < 8; i++) { + counts64.s64[i] = 9; + } + + for (i = 0; i < 32; i++) { + counts16.s16[i] = 4; + } +} + + +void NOINLINE emulate_shuffle(void *presult, const void *p, +const void *mask_src, int size, int control, int mask) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + V512 *src = (V512 *)mask_src; + for (i = 0; i < size; i++) { + if (((1 << i) & mask) == 0) + result->u32[i] = src->u32[i]; + else + result->u32[i] = v->u32[4 * (i / 4) + ((control >> (2 * (i % 4))) & 3)]; + } +} + + +void NOINLINE emulate_shuffle16(void *presult, const void *p, +const void *mask_src, int size, int control, int mask, int order) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + V512 *src = (V512 *)mask_src; + for (i = 0; i < size; i++) { + if (((1 << i) & mask) == 0) { + result->u16[i] = src->u16[i]; + } else { + if ((i / 4) % 2 == order) { + result->u16[i] = v->u16[i]; + } else { + result->u16[i] = v->u16[4 * (i / 4) + ((control >> (2 * (i % 4))) & 3)]; + } + } + } +} + + +void NOINLINE do_shuffle_epi32() { + volatile V512 res; + V512 expected; + + // checking mm512 shuffle + soft_counts32_update(); + res.zmmi = _mm512_shuffle_epi32(src.zmmi, 3); + emulate_shuffle(&expected, &src, &zeros, 16, 3, 0xffff); + check_equal_nd(&res, &expected, 16, "_mm512_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + 
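+  // Control 3 == 0b00000011: element 3 of each 4-dword group goes to
+  // position 0 and element 0 fills positions 1-3.  In the mask form below,
+  // destination elements whose k16 bit is 0 keep the passthru value, which
+  // emulate_shuffle models by reading those elements from its mask_src
+  // argument.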
res.zmmi = _mm512_mask_shuffle_epi32(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle(&expected, &src, &passthru, 16, 3, k16); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shuffle_epi32(k16, src.zmmi, 3); + emulate_shuffle(&expected, &src, &zeros, 16, 3, k16); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); + + // checking mm256 shuffle + soft_counts32_update(); + res.ymmi[0] = _mm256_shuffle_epi32(src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 8, 3, 0xff); + check_equal_nd(&res, &expected, 8, "_mm256_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shuffle_epi32(passthru.ymmi[0], k8, src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &passthru, 8, 3, k8); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_maskz_shuffle_epi32(k8, src.ymmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 8, 3, k8); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); + + // checking mm shuffle + soft_counts32_update(); + res.xmmi[0] = _mm_shuffle_epi32(src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 4, 3, 0xf); + check_equal_nd(&res, &expected, 4, "_mm_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shuffle_epi32(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &passthru, 4, 3, k8); + check_equal_nd(&res, &expected, 4, "_mm_mask_shuffle_epi32", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shuffle_epi32(k8, src.xmmi[0], 3); + emulate_shuffle(&expected, &src, &zeros, 4, 3, k8); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shuffle_epi32", __LINE__); + soft_counts32_update(); +} + +void NOINLINE do_shufflehi_epi16() { + volatile V512 res; + V512 expected; + + // checking mm512 shufflehi + soft_counts32_update(); + res.zmmi = _mm512_shufflehi_epi16(src.zmmi, 3); + emulate_shuffle16(&expected, &src, &src, 32, 3, 0xffffffff, 0); + check_equal_nd(&res, &expected, 16, "_mm512_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_mask_shufflehi_epi16(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &passthru, 32, 3, k16, 0); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shufflehi_epi16(k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &zeros, 32, 3, k16, 0); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + // checking mm256 shufflehi + soft_counts32_update(); + res.ymmi[0] = _mm256_shufflehi_epi16(src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, 0xffff, 0); + check_equal_nd(&res, &expected, 8, "_mm256_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shufflehi_epi16(passthru.ymmi[0], k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 16, 3, k16, 0); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_maskz_shufflehi_epi16(k16, 
src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, k16, 0); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + // checking mm shufflehi + soft_counts32_update(); + res.xmmi[0] = _mm_shufflehi_epi16(src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, 0xff, 0); + check_equal_nd(&res, &expected, 4, "_mm_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shufflehi_epi16(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 8, 3, k8, 0); + check_equal_nd(&res, &expected, 4, "_mm_mask_shufflehi_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shufflehi_epi16(k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, k8, 0); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shufflehi_epi16", __LINE__); + soft_counts32_update(); +} + +void NOINLINE do_shufflelo_epi16() { + volatile V512 res; + V512 expected; + + // checking mm512 shufflelo + soft_counts32_update(); + res.zmmi = _mm512_shufflelo_epi16(src.zmmi, 3); + emulate_shuffle16(&expected, &src, &src, 32, 3, 0xffffffff, 1); + check_equal_nd(&res, &expected, 16, "_mm512_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_mask_shufflelo_epi16(passthru.zmmi, k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &passthru, 32, 3, k16, 1); + check_equal_nd(&res, &expected, 16, "_mm512_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_shufflelo_epi16(k16, src.zmmi, 3); + emulate_shuffle16(&expected, &src, &zeros, 32, 3, k16, 1); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + // checking mm256 shufflelo + soft_counts32_update(); + res.ymmi[0] = _mm256_shufflelo_epi16(src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, 0xffff, 1); + check_equal_nd(&res, &expected, 8, "_mm256_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_mask_shufflelo_epi16(passthru.ymmi[0], k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 16, 3, k16, 1); + check_equal_nd(&res, &expected, 8, "_mm256_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.ymmi[0] = _mm256_maskz_shufflelo_epi16(k16, src.ymmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 16, 3, k16, 1); + check_equal_nd(&res, &expected, 8, "_mm256_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + // checking mm shufflelo + soft_counts32_update(); + res.xmmi[0] = _mm_shufflelo_epi16(src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, 0xff, 1); + check_equal_nd(&res, &expected, 4, "_mm_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_mask_shufflelo_epi16(passthru.xmmi[0], k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &passthru, 8, 3, k8, 1); + check_equal_nd(&res, &expected, 4, "_mm_mask_shufflelo_epi16", __LINE__); + soft_counts32_update(); + + soft_counts32_update(); + res.xmmi[0] = _mm_maskz_shufflelo_epi16(k8, src.xmmi[0], 3); + emulate_shuffle16(&expected, &src, &zeros, 8, 3, k8, 1); + check_equal_nd(&res, &expected, 4, "_mm_maskz_shufflelo_epi16", __LINE__); + soft_counts32_update(); +} + +int main(int argc, char *argv[]) { + init(); + + do_shuffle_epi32(); + 
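+  // shufflelo/shufflehi permute only the low/high four words of each 128-bit
+  // lane; emulate_shuffle16's order flag (1 for lo, 0 for hi) marks which
+  // quad is copied through unchanged.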
do_shufflelo_epi16(); + do_shufflehi_epi16(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/shuffles.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c +++ SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.c @@ -0,0 +1,802 @@ +/* + * Here we check for punpck* and unpck* intrinsics using masm. + */ + +#include "m512_test_util.h" +#include +#include + +int verbose = 0; + +__m512i i1; +__m512i i2; +__m512i i3; +__m512i i4; +__m512i i5; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE display_xmm_pi(const void *p, const char *banner) { + int i; + V512 *v = (V512 *)p; + + if (banner) { + printf("%s", banner); + } + + for (i = 0; i < 4; i++) { + printf(" 0x%0.8x", v->s32[3 - i]); + } + printf("\n"); +} + +void NOINLINE init() { + int i; + V512 *pi1 = (V512 *)&i1; + V512 *pi2 = (V512 *)&i2; + V512 *pi3 = (V512 *)&i3; + + for (i = 0; i < 64; i++) { + pi1->u8[i] = 17 + ((i & 1) ? 1 : -1) * i + vol; + + pi2->u8[i] = 100 + ((i & 3) == 3 ? 1 : -1) * i + vol; + + pi3->u8[i] = 400 + ((i & 1) ? -1 : 1) * i + vol; + } +} + +#define check_equal_xmm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 4, banner, __LINE__) +#define check_equal_ymm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 8, banner, __LINE__) +#define check_equal_zmm(vgot, vexpected, banner) \ + check_equal_nd(vgot, vexpected, 16, banner, __LINE__) + +void NOINLINE emulate_palignr(void *presult, const void *p1, const void *p2, + int shift, int num_lanes) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + + if (shift < 0 || shift > 31) { + /* Result is zero. */ + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 4; i++) { + result->u32[4 * lane + i] = 0; + } + } + + return; + } + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < (16 - shift); i++) { + result->u8[16 * lane + i] = v2->u8[16 * lane + i + shift]; + } + for (; i < 16 && (i + shift < 32); i++) { + result->u8[16 * lane + i] = v1->u8[16 * lane + i - (16 - shift)]; + } + for (; i < 16; i++) { + result->u8[16 * lane + i] = 0; + } + } +} + +void NOINLINE emulate_punpck_bw(void *presult, const void *p1, const void *p2, + int num_lanes, int high) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + int offset = high ? 8 : 0; + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 8; i++) { + result->u8[16 * lane + 2 * i] = v1->u8[16 * lane + i + offset]; + result->u8[16 * lane + 2 * i + 1] = v2->u8[16 * lane + i + offset]; + } + } +} + +#define emulate_punpckhbw(presult, p1, p2, num_lanes) \ + emulate_punpck_bw(presult, p1, p2, num_lanes, 1) +#define emulate_punpcklbw(presult, p1, p2, num_lanes) \ + emulate_punpck_bw(presult, p1, p2, num_lanes, 0) + +void NOINLINE emulate_punpck_wd(void *presult, const void *p1, const void *p2, + int num_lanes, int high) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + int offset = high ? 
4 : 0; + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 8; i++) { + result->u16[8 * lane + 2 * i] = v1->u16[8 * lane + i + offset]; + result->u16[8 * lane + 2 * i + 1] = v2->u16[8 * lane + i + offset]; + } + } +} + +#define emulate_punpckhwd(presult, p1, p2, num_lanes) \ + emulate_punpck_wd(presult, p1, p2, num_lanes, 1) +#define emulate_punpcklwd(presult, p1, p2, num_lanes) \ + emulate_punpck_wd(presult, p1, p2, num_lanes, 0) + +void NOINLINE emulate_punpck_dq(void *presult, const void *p1, const void *p2, + int num_lanes, int high) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + int offset = high ? 2 : 0; + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 4; i++) { + result->u32[4 * lane + 2 * i] = v1->u32[4 * lane + i + offset]; + result->u32[4 * lane + 2 * i + 1] = v2->u32[4 * lane + i + offset]; + } + } +} + +#define emulate_punpckhdq(presult, p1, p2, num_lanes) \ + emulate_punpck_dq(presult, p1, p2, num_lanes, 1) +#define emulate_punpckldq(presult, p1, p2, num_lanes) \ + emulate_punpck_dq(presult, p1, p2, num_lanes, 0) + +void NOINLINE emulate_punpck_qdq(void *presult, const void *p1, const void *p2, + int num_lanes, int high) { + int i, lane; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + int offset = high ? 1 : 0; + + for (lane = 0; lane < num_lanes; lane++) { + for (i = 0; i < 2; i++) { + result->u64[2 * lane + 2 * i] = v1->u64[2 * lane + i + offset]; + result->u64[2 * lane + 2 * i + 1] = v2->u64[2 * lane + i + offset]; + } + } +} + +#define emulate_punpckhqdq(presult, p1, p2, num_lanes) \ + emulate_punpck_qdq(presult, p1, p2, num_lanes, 1) +#define emulate_punpcklqdq(presult, p1, p2, num_lanes) \ + emulate_punpck_qdq(presult, p1, p2, num_lanes, 0) + +void NOINLINE do_punpck_bw() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + +#define DO_XMM_REG_REG(opcode, reg1, reg2) \ + __asm {\ + __asm mov FULL_IREG(ax), [p1] \ + __asm movaps reg2, [FULL_IREG(ax)] \ + __asm mov FULL_IREG(ax), [p2] \ + __asm movaps reg1, [FULL_IREG(ax)] \ + __asm opcode reg1, reg2 \ + __asm mov FULL_IREG(ax), [p3] \ + __asm movaps [FULL_IREG(ax)], reg1 \ + } + +#define DO_V_REG_REG_REG(opcode, reg1, reg2, reg3) \ + __asm { \ + __asm mov FULL_IREG(ax), [p1] \ + __asm vmovaps reg3, [FULL_IREG(ax)] \ + __asm mov FULL_IREG(ax), [p2] \ + __asm vmovaps reg2, [FULL_IREG(ax)] \ + __asm opcode reg1, reg2, reg3 \ + __asm mov FULL_IREG(ax), [p3] \ + __asm vmovaps [FULL_IREG(ax)], reg1} + + DO_XMM_REG_REG(punpckhbw, xmm6, xmm3) + emulate_punpckhbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhbw xmm6, xmm3"); + if (verbose) { + printf("punpckhbw(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_XMM_REG_REG(punpcklbw, xmm2, xmm7) + emulate_punpcklbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhbw xmm2, xmm7"); + + DO_V_REG_REG_REG(vpunpckhbw, xmm1, xmm6, xmm5) + emulate_punpckhbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhbw xmm1, xmm6, xmm5"); + + DO_V_REG_REG_REG(vpunpckhbw, ymm4, ymm7, ymm5) + emulate_punpckhbw(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhbw ymm4, ymm7, ymm5"); + + DO_V_REG_REG_REG(vpunpcklbw, ymm7, ymm3, ymm1) + emulate_punpcklbw(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpcklbw ymm7, ymm3, ymm1"); + + DO_V_REG_REG_REG(vpunpckhbw, zmm4, zmm7, zmm5) + emulate_punpckhbw(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, 
"vpunpckhbw zmm4, zmm4, zmm5"); + + DO_V_REG_REG_REG(vpunpcklbw, zmm7, zmm3, zmm1) + emulate_punpcklbw(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpcklbw zmm7, zmm3, zmm1"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_V_REG_REG_REG(vpunpckhbw, xmm19, xmm6, xmm5) + emulate_punpckhbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhbw xmm19, xmm6, xmm5"); + + DO_V_REG_REG_REG(vpunpckhbw, xmm6, xmm19, xmm5) + emulate_punpckhbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhbw xmm6, xmm19, xmm5"); + + DO_V_REG_REG_REG(vpunpckhbw, xmm6, xmm5, xmm19) + emulate_punpckhbw(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhbw xmm6, xmm5, xmm19"); + + DO_V_REG_REG_REG(vpunpckhbw, zmm19, zmm6, zmm5) + emulate_punpckhbw(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhbw zmm19, zmm6, zmm5"); + + DO_V_REG_REG_REG(vpunpckhbw, zmm6, zmm19, zmm5) + emulate_punpckhbw(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhbw zmm6, zmm19, zmm5"); + + DO_V_REG_REG_REG(vpunpckhbw, zmm6, zmm5, zmm19) + emulate_punpckhbw(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhbw zmm6, zmm5, zmm19"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ +} + +void NOINLINE do_punpck_wd() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + + DO_XMM_REG_REG(punpckhwd, xmm6, xmm3) + emulate_punpckhwd(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhwd xmm6, xmm3"); + if (verbose) { + printf("punpckhwd(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_XMM_REG_REG(punpcklwd, xmm2, xmm7) + emulate_punpcklwd(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhwd xmm2, xmm7"); + + DO_V_REG_REG_REG(vpunpckhwd, xmm1, xmm6, xmm5) + emulate_punpckhwd(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhwd xmm1, xmm6, xmm5"); + + DO_V_REG_REG_REG(vpunpckhwd, ymm4, ymm7, ymm5) + emulate_punpckhwd(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhwd ymm4, ymm7, ymm5"); + + DO_V_REG_REG_REG(vpunpcklwd, ymm7, ymm3, ymm1) + emulate_punpcklwd(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpcklwd ymm7, ymm3, ymm1"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm4, zmm7, zmm5) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm4, zmm4, zmm5"); + + DO_V_REG_REG_REG(vpunpcklwd, zmm7, zmm3, zmm1) + emulate_punpcklwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpcklwd zmm7, zmm3, zmm1"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_V_REG_REG_REG(vpunpckhwd, ymm19, ymm6, ymm5) + emulate_punpckhwd(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhwd ymm19, ymm6, ymm5"); + + DO_V_REG_REG_REG(vpunpckhwd, ymm6, ymm19, ymm5) + emulate_punpckhwd(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhwd ymm6, ymm19, ymm5"); + + DO_V_REG_REG_REG(vpunpckhwd, ymm6, ymm5, ymm19) + emulate_punpckhwd(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhwd ymm6, ymm5, ymm19"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm19, zmm6, zmm5) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm19, zmm6, zmm5"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm19, zmm5) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm19, zmm5"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm5, zmm19) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm5, zmm19"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm26, zmm6, zmm5) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, 
"vpunpckhwd zmm26, zmm6, zmm5"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm26, zmm5) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm26, zmm5"); + + DO_V_REG_REG_REG(vpunpckhwd, zmm6, zmm5, zmm26) + emulate_punpckhwd(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhwd zmm6, zmm5, zmm26"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ +} + +#define DO_Z_REG_MASK_REG_REG(opcode, reg1, kreg, reg2, reg3) \ + __asm { \ + __asm mov FULL_IREG(ax), [p1] \ + __asm vmovaps reg3, [FULL_IREG(ax)] \ + __asm mov FULL_IREG(ax), [p2] \ + __asm vmovaps reg2, [FULL_IREG(ax)] \ + __asm kxnorw kreg, kreg, kreg \ + __asm opcode reg1{kreg} \ + , reg2, reg3 __asm mov FULL_IREG(ax), [p3] __asm vmovaps[FULL_IREG(ax)], \ + reg1 \ + } + +void NOINLINE do_punpck_dq() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + + DO_XMM_REG_REG(punpckhdq, xmm6, xmm3) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhdq xmm6, xmm3"); + if (verbose) { + printf("punpckhdq(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_XMM_REG_REG(punpckldq, xmm2, xmm7) + emulate_punpckldq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhdq xmm2, xmm7"); + + DO_V_REG_REG_REG(vpunpckhdq, xmm1, xmm6, xmm5) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhdq xmm1, xmm6, xmm5"); + + DO_V_REG_REG_REG(vpunpckhdq, ymm4, ymm7, ymm5) + emulate_punpckhdq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhdq ymm4, ymm7, ymm5"); + + DO_V_REG_REG_REG(vpunpckldq, ymm7, ymm3, ymm1) + emulate_punpckldq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckldq ymm7, ymm3, ymm1"); + + DO_V_REG_REG_REG(vpunpckhdq, zmm4, zmm7, zmm5) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhdq zmm4, zmm4, zmm5"); + + DO_V_REG_REG_REG(vpunpckldq, zmm7, zmm3, zmm1) + emulate_punpckldq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckldq zmm7, zmm3, zmm1"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_V_REG_REG_REG(vpunpckhdq, xmm23, xmm7, xmm5) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhdq xmm23, xmm7, xmm5"); + + DO_V_REG_REG_REG(vpunpckhdq, xmm7, xmm23, xmm5) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhdq xmm7, xmm23, xmm5"); + + DO_V_REG_REG_REG(vpunpckhdq, xmm7, xmm5, xmm23) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhdq xmm7, xmm5, xmm23"); + + DO_V_REG_REG_REG(vpunpckhdq, ymm23, ymm16, ymm5) + emulate_punpckhdq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhdq ymm23, ymm16, ymm5"); + + DO_V_REG_REG_REG(vpunpckhdq, zmm23, zmm7, zmm5) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhdq zmm23, zmm7, zmm5"); + + DO_V_REG_REG_REG(vpunpckhdq, zmm7, zmm23, zmm5) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhdq zmm7, zmm23, zmm5"); + + DO_V_REG_REG_REG(vpunpckhdq, zmm7, zmm5, zmm23) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhdq zmm7, zmm5, zmm23"); + + DO_Z_REG_MASK_REG_REG(vpunpckhdq, zmm23, k4, zmm7, zmm5) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhdq zmm23{k4}, zmm7, zmm5"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ +} + +void NOINLINE do_punpck_qdq() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + + DO_XMM_REG_REG(punpckhqdq, xmm6, xmm3) + emulate_punpckhqdq(&i4, &i2, 
&i1, 1); + check_equal_xmm(&i4, &i3, "punpckhqdq xmm6, xmm3"); + if (verbose) { + printf("punpckhqdq(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_XMM_REG_REG(punpcklqdq, xmm2, xmm7) + emulate_punpcklqdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "punpckhqdq xmm2, xmm7"); + + DO_V_REG_REG_REG(vpunpckhqdq, xmm1, xmm6, xmm5) + emulate_punpckhqdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vpunpckhqdq xmm1, xmm6, xmm5"); + + DO_V_REG_REG_REG(vpunpckhqdq, ymm4, ymm7, ymm5) + emulate_punpckhqdq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpckhqdq ymm4, ymm7, ymm5"); + + DO_V_REG_REG_REG(vpunpcklqdq, ymm7, ymm3, ymm1) + emulate_punpcklqdq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vpunpcklqdq ymm7, ymm3, ymm1"); + + DO_V_REG_REG_REG(vpunpckhqdq, zmm4, zmm7, zmm5) + emulate_punpckhqdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpckhqdq zmm4, zmm4, zmm5"); + + DO_V_REG_REG_REG(vpunpcklqdq, zmm7, zmm3, zmm1) + emulate_punpcklqdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpcklqdq zmm7, zmm3, zmm1"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_Z_REG_MASK_REG_REG(vpunpcklqdq, zmm31, k6, zmm29, zmm27) + emulate_punpcklqdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vpunpcklqdq zmm31{k6}, zmm29, zmm27"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ +} + +void NOINLINE do_punpck_ps() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + + DO_XMM_REG_REG(unpckhps, xmm6, xmm3) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "unpckhps xmm6, xmm3"); + if (verbose) { + printf("unpckhps(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_XMM_REG_REG(unpcklps, xmm2, xmm7) + emulate_punpckldq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "unpckhps xmm2, xmm7"); + + DO_V_REG_REG_REG(vunpckhps, xmm1, xmm6, xmm5) + emulate_punpckhdq(&i4, &i2, &i1, 1); + check_equal_xmm(&i4, &i3, "vunpckhps xmm1, xmm6, xmm5"); + + DO_V_REG_REG_REG(vunpckhps, ymm4, ymm7, ymm5) + emulate_punpckhdq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vunpckhps ymm4, ymm7, ymm5"); + + DO_V_REG_REG_REG(vunpcklps, ymm7, ymm3, ymm1) + emulate_punpckldq(&i4, &i2, &i1, 2); + check_equal_ymm(&i4, &i3, "vunpcklps ymm7, ymm3, ymm1"); + + DO_V_REG_REG_REG(vunpckhps, zmm4, zmm7, zmm5) + emulate_punpckhdq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vunpckhps zmm4, zmm4, zmm5"); + + DO_V_REG_REG_REG(vunpcklps, zmm7, zmm3, zmm1) + emulate_punpckldq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vunpcklps zmm7, zmm3, zmm1"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_Z_REG_MASK_REG_REG(vunpcklps, zmm30, k5, zmm28, zmm26) + emulate_punpckldq(&i4, &i2, &i1, 4); + check_equal_zmm(&i4, &i3, "vunpcklps zmm30{k5}, zmm28, zmm26"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ +} + +void NOINLINE do_palignr() { + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + +#define DO_XMM_REG_REG_IMM(opcode, reg1, reg2, imm) \ + __asm {\ + __asm mov FULL_IREG(ax), [p1] \ + __asm movaps reg2, [FULL_IREG(ax)] \ + __asm mov FULL_IREG(ax), [p2] \ + __asm movaps reg1, [FULL_IREG(ax)] \ + __asm opcode reg1, reg2, imm \ + __asm mov FULL_IREG(ax), [p3] \ + __asm movaps [FULL_IREG(ax)], reg1 \ + } + +#define DO_V_REG_REG_REG_IMM(opcode, reg1, reg2, reg3, imm) \ + __asm { \ + __asm mov FULL_IREG(ax), [p1] \ + __asm vmovaps reg3, [FULL_IREG(ax)] \ + __asm mov FULL_IREG(ax), [p2] \ + 
__asm vmovaps reg2, [FULL_IREG(ax)] \ + __asm opcode reg1, reg2, reg3, imm \ + __asm mov FULL_IREG(ax), [p3] \ + __asm vmovaps [FULL_IREG(ax)], reg1} + + DO_XMM_REG_REG_IMM(palignr, xmm6, xmm3, 19) + emulate_palignr(&i4, &i2, &i1, 19, 1); + check_equal_xmm(&i4, &i3, "palignr xmm6, xmm3, 19"); + if (verbose) { + printf("palignr(i2, i1)\n"); + display_xmm_pi(&i1, "i1: "); + display_xmm_pi(&i2, "i2: "); + display_xmm_pi(&i3, "got: "); + display_xmm_pi(&i4, "exp: "); + } + + DO_V_REG_REG_REG_IMM(vpalignr, xmm6, xmm7, xmm3, 19) + emulate_palignr(&i4, &i2, &i1, 19, 1); + check_equal_xmm(&i4, &i3, "palignr xmm6, xmm7, xmm3, 19"); + + DO_V_REG_REG_REG_IMM(vpalignr, ymm6, ymm7, ymm3, 19) + emulate_palignr(&i4, &i2, &i1, 19, 2); + check_equal_ymm(&i4, &i3, "palignr ymm6, ymm7, ymm3, 19"); + + DO_V_REG_REG_REG_IMM(vpalignr, zmm4, zmm7, zmm5, 12) + emulate_palignr(&i4, &i2, &i1, 12, 4); + check_equal_zmm(&i4, &i3, "vpalignr zmm4, zmm4, zmm5, 12"); + +#if defined(__x86_64) || defined(_M_X64) + + DO_V_REG_REG_REG_IMM(vpalignr, ymm27, ymm5, ymm3, 18) + emulate_palignr(&i4, &i2, &i1, 18, 4); + check_equal_ymm(&i4, &i3, "vpalignr ymm27, ymm5, ymm3, 18"); + + DO_V_REG_REG_REG_IMM(vpalignr, zmm3, zmm5, zmm27, 9) + emulate_palignr(&i4, &i2, &i1, 9, 4); + check_equal_zmm(&i4, &i3, "vpalignr zmm3, zmm5, zmm27, 9"); + + DO_V_REG_REG_REG_IMM(vpalignr, zmm27, zmm5, zmm3, 22) + emulate_palignr(&i4, &i2, &i1, 22, 4); + check_equal_zmm(&i4, &i3, "vpalignr zmm27, zmm5, zmm3, 22"); + + DO_V_REG_REG_REG_IMM(vpalignr, zmm5, zmm27, zmm3, 13) + emulate_palignr(&i4, &i2, &i1, 13, 4); + check_equal_zmm(&i4, &i3, "vpalignr zmm5, zmm27, zmm3, 13"); + +#endif /* defined(__x86_64) || defined(_M_X64) */ + + i3 = _mm512_alignr_epi8(i2, i1, 6); + emulate_palignr(&i4, &i2, &i1, 6, 4); + check_equal_zmm(&i4, &i3, "_mm512_alignr_epi8"); +} + +void NOINLINE compare_reg_reg_vs_reg_mem() { + /* Check that zmm-memory operand forms are parsed and encoded properly. 
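+   * Each instruction below is issued twice -- once with a register source
+   * and once with a memory source.  The two results are subtracted and the
+   * differences are OR-accumulated into zmm7, which must remain all-zero.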
*/ + + void *p1 = &i1; + void *p2 = &i2; + void *p3 = &i3; + void *p4 = &i4; + void *p5 = &i5; + + __asm { + __asm mov FULL_IREG(ax), [p1] + __asm vmovaps zmm1, [FULL_IREG(ax)] + __asm mov FULL_IREG(ax), [p2] + __asm vmovaps zmm2, [FULL_IREG(ax)] + __asm mov FULL_IREG(ax), [p3] + __asm vmovaps zmm3, [FULL_IREG(ax)] + + __asm vpxord zmm6, zmm6, zmm6 + __asm vpxord zmm7, zmm7, zmm7 + + /* vpunpckhbw */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpckhbw zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpckhbw zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpunpcklbw */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpcklbw zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpcklbw zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpunpckhwd */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpckhwd zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpckhwd zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpunpcklwd */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpcklwd zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpcklwd zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpunpckhdq */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpckhdq zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpckhdq zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + __asm mov FULL_IREG(ax), [p3] + __asm vpunpcklqdq zmm5, zmm2, [FULL_IREG(ax)]{1to8} + + /* vpunpckldq */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpunpckldq zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpunpckldq zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vunpckhps */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vunpckhps zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vunpckhps zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vunpcklps */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vunpcklps zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vunpcklps zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vunpckhpd */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vunpckhpd zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vunpckhpd zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vunpcklpd */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vunpcklpd zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vunpcklpd zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpermilps reg */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpermilps zmm4, zmm2, zmm3 + __asm mov FULL_IREG(ax), [p3] + __asm vpermilps zmm5, zmm2, [FULL_IREG(ax)] + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpermilps imm */ + + __asm vmovaps zmm4, zmm1 + __asm vmovaps zmm5, zmm1 + __asm vpermilps zmm4, zmm2, 0x35 + __asm mov FULL_IREG(ax), [p2] + __asm vpermilps zmm5, [FULL_IREG(ax)], 0x35 + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vshufps */ + + /* __asm 
mov FULL_IREG(ax), [p3] */ + /* __asm vbroadcastf32x4 zmm5, [FULL_IREG(ax)] */ + /* __asm vshufps zmm4, zmm2, zmm5, 0x65 */ + __asm vshufps zmm4, zmm2, zmm3, 0x65 + __asm mov FULL_IREG(ax), [p3] + __asm vshufps zmm5, zmm2, [FULL_IREG(ax)], 0x65 + /* __asm vshufps zmm5, zmm2, [FULL_IREG(ax)]{4to16}, 0x65 */ + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* vpalignr */ + + __asm vpalignr zmm4, zmm2, zmm3, 0xd + __asm mov FULL_IREG(ax), [p3] + __asm vpalignr zmm5, zmm2, [FULL_IREG(ax)], 0xd + __asm vpsubd zmm6, zmm4, zmm5 + __asm vpord zmm7, zmm7, zmm6 + + /* Cumulative difference from zero is in zmm7, save this in i5. */ + + __asm mov FULL_IREG(ax), [p5] + __asm vmovaps[FULL_IREG(ax)], zmm7 + + /* Expected difference is zero, put zero in i4. */ + + __asm vpxord zmm7, zmm7, zmm7 + __asm mov FULL_IREG(ax), [p4] + __asm vmovaps[FULL_IREG(ax)], zmm7 +} + +check_equal_zmm(&i5, &i4, "various 512-bit reg-reg vs reg-mem"); +} + +int main(int argc, char *argv[]) { + if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' && + argv[1][2] == '\0') { + verbose = 1; + } + + init(); + do_punpck_bw(); + do_punpck_wd(); + do_punpck_dq(); + do_punpck_qdq(); + do_punpck_ps(); + do_palignr(); + compare_reg_reg_vs_reg_mem(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output +++ SingleSource/UnitTests/Vector/AVX512BWVL/unpack_msasm.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512DQ/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQ/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512DQ/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=${X86CPU_ARCH}") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512DQ-") Index: SingleSource/UnitTests/Vector/AVX512DQ/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQ/Makefile +++ SingleSource/UnitTests/Vector/AVX512DQ/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512DQ/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mavx512dq -I${SourceDir}/.. +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=native -mavx512dq +LCCFLAGS += -march=native -mavx512dq Index: SingleSource/UnitTests/Vector/AVX512DQ/casts.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQ/casts.c +++ SingleSource/UnitTests/Vector/AVX512DQ/casts.c @@ -0,0 +1,215 @@ +/* + * Here we check for _mm512_cast* and _mm512_xor_* intrinsics. 
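+ * The casts only reinterpret the bits of a vector as another element type
+ * (no value conversion takes place), so casting an all-zero vector must
+ * yield all zeros -- which is what check_zero() verifies.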
+ */ +#include "m512_test_util.h" + +volatile int vol = 0; /* Inhibit optimization */ + +__m512 f1, f2, f3, f3_orig; +__m512d d1, d2, d3, d3_orig; +__m512i i1, i2, i3, i3_orig; + +void NOINLINE set_nonzero(void *vp, int c) { + int i; + V512 *v = (V512 *)vp; + + for (i = 0; i < 16; i++) { + v->u32[i] = 10 * i * i - 3 * i + c; + if (v->u32[i] == 0) { + v->u32[i] = 1234; + } + } +} + +void NOINLINE check_zero(void *vp, char *banner) { + int i; + V512 *v = (V512 *)vp; + + for (i = 0; i < 16; i++) { + if (v->u32[i] + vol != 0) { + printf("ERROR: %s failed\n", banner ? banner : ""); + n_errs++; + break; + } + } +} + +void NOINLINE do_setzero() { + set_nonzero(&f1, 18); + f1 = _mm512_setzero_ps(); + check_zero(&f1, "_mm512_setzero_ps"); + + set_nonzero(&f2, 19); + f2 = _mm512_setzero(); + check_zero(&f2, "_mm512_setzero"); + + set_nonzero(&d1, 20); + d1 = _mm512_setzero_pd(); + check_zero(&d1, "_mm512_setzero_pd"); + + set_nonzero(&i1, 21); + i1 = _mm512_setzero_epi32(); + check_zero(&i1, "_mm512_setzero_epi32"); +} + +void NOINLINE do_cast() { + set_nonzero(&f1, 1); + d1 = _mm512_setzero_pd(); + f1 = _mm512_castpd_ps(d1); + check_zero(&f1, "_mm512_castpd_ps"); + + set_nonzero(&i1, 1); + d1 = _mm512_setzero_pd(); + i1 = _mm512_castpd_si512(d1); + check_zero(&i1, "_mm512_castpd_si512"); + + set_nonzero(&d2, 1); + f2 = _mm512_setzero_ps(); + d2 = _mm512_castps_pd(f2); + check_zero(&d2, "_mm512_castps_pd"); + + set_nonzero(&i2, 1); + f2 = _mm512_setzero_ps(); + i2 = _mm512_castps_si512(f2); + check_zero(&i2, "_mm512_castps_si512"); + + set_nonzero(&f3, 1); + i3 = _mm512_setzero_epi32(); + f3 = _mm512_castsi512_ps(i3); + check_zero(&f3, "_mm512_castsi512_ps"); + + set_nonzero(&d3, 1); + i3 = _mm512_setzero_epi32(); + d3 = _mm512_castsi512_pd(i3); + check_zero(&d3, "_mm512_castsi512_pd"); +} + +void NOINLINE do_size_casts() { + __m128d xd; + __m128 xs; + __m128i xi; + __m256d yd; + __m256 ys; + __m256i yi; + __m512d zd; + __m512 zs; + __m512i zi; + + set_nonzero(&f1, 1); + set_nonzero(&i1, 1); + set_nonzero(&d1, 1); + + xd = _mm512_castpd512_pd128(d1); + check_equal_nd(&xd, &d1, 4, "_mm512_castpd512_pd128", __LINE__); + xs = _mm512_castps512_ps128(f1); + check_equal_nd(&xs, &f1, 4, "_mm512_castps512_ps128", __LINE__); + xi = _mm512_castsi512_si128(i1); + check_equal_nd(&xi, &i1, 4, "_mm512_castsi512_si128", __LINE__); + + yd = _mm512_castpd512_pd256(d1); + check_equal_nd(&yd, &d1, 8, "_mm512_castpd512_pd256", __LINE__); + ys = _mm512_castps512_ps256(f1); + check_equal_nd(&ys, &f1, 8, "_mm512_castps512_ps256", __LINE__); + yi = _mm512_castsi512_si256(i1); + check_equal_nd(&yi, &i1, 8, "_mm512_castsi512_si256", __LINE__); + + zd = _mm512_castpd128_pd512(xd); + check_equal_nd(&zd, &d1, 4, "_mm512_castpd128_pd512", __LINE__); + zs = _mm512_castps128_ps512(xs); + check_equal_nd(&zs, &f1, 4, "_mm512_castps128_ps512", __LINE__); + zi = _mm512_castsi128_si512(xi); + check_equal_nd(&zi, &i1, 4, "_mm512_castsi128_si512", __LINE__); + + zd = _mm512_castpd256_pd512(yd); + check_equal_nd(&zd, &d1, 8, "_mm512_castpd256_pd512", __LINE__); + zs = _mm512_castps256_ps512(ys); + check_equal_nd(&zs, &f1, 8, "_mm512_castps256_ps512", __LINE__); + zi = _mm512_castsi256_si512(yi); + check_equal_nd(&zi, &i1, 8, "_mm512_castsi256_si512", __LINE__); +} + +void NOINLINE check_xor(void *vp1, void *vp2, void *vp3, void *vp_orig, + int mask, char *banner) { + int i; + V512 *v1 = (V512 *)vp1; + V512 *v2 = (V512 *)vp2; + V512 *v3 = (V512 *)vp3; + V512 *v_orig = (V512 *)vp_orig; + + for (i = 0; i < 16; i++) { + int actual = 
v3->u32[i]; + int expected = v_orig->u32[i]; + if (mask & (1 << i)) { + expected = v1->u32[i] ^ v2->u32[i]; + } + if (actual + vol != expected - vol) { + printf("ERROR: %s failed\n", banner ? banner : ""); + n_errs++; + break; + } + } +} + +void NOINLINE do_xor() { + set_nonzero(&i1, 99); + set_nonzero(&i2, 100); + set_nonzero(&f1, 33); + set_nonzero(&f2, -35); + set_nonzero(&d1, -11); + set_nonzero(&d2, 14); + + set_nonzero(&i3, 1000); + i3_orig = i3; + i3 = _mm512_xor_epi32(i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi32"); + + set_nonzero(&i3, 1500); + i3_orig = i3; + i3 = _mm512_mask_xor_epi32(i3_orig, 0x5555, i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0x5555, "_mm512_mask_xor_epi32"); + + set_nonzero(&f3, 1000); + f3_orig = f3; + f3 = _mm512_xor_ps(f1, f2); + check_xor(&f1, &f2, &f3, &f3_orig, 0xffff, "_mm512_xor_ps"); + + set_nonzero(&f3, 1500); + f3_orig = f3; + f3 = _mm512_mask_xor_ps(f3_orig, 0x5795, f1, f2); + check_xor(&f1, &f2, &f3, &f3_orig, 0x5795, "_mm512_mask_xor_ps"); + + set_nonzero(&i3, 2000); + i3_orig = i3; + i3 = _mm512_xor_epi64(i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi64"); + + set_nonzero(&i3, 2500); + i3_orig = i3; + i3 = _mm512_mask_xor_epi64(i3_orig, 0x55, i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0x3333, "_mm512_mask_xor_epi64"); + + set_nonzero(&d3, 2000); + d3_orig = d3; + d3 = _mm512_xor_pd(d1, d2); + check_xor(&d1, &d2, &d3, &d3_orig, 0xffff, "_mm512_xor_pd"); + + set_nonzero(&d3, 2500); + d3_orig = d3; + d3 = _mm512_mask_xor_pd(d3_orig, 0x55, d1, d2); + check_xor(&d1, &d2, &d3, &d3_orig, 0x3333, "_mm512_mask_xor_pd"); +} + +int main() { + do_setzero(); + do_cast(); + do_size_casts(); + do_xor(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512DQ/casts.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512DQ/casts.reference_output +++ SingleSource/UnitTests/Vector/AVX512DQ/casts.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512F/CMakeLists.txt @@ -1,3 +1,4 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR}) list(APPEND LDFLAGS -lm) list(APPEND CFLAGS "-march=${X86CPU_ARCH}") list(APPEND CFLAGS -fms-extensions) Index: SingleSource/UnitTests/Vector/AVX512F/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/Makefile +++ SingleSource/UnitTests/Vector/AVX512F/Makefile @@ -2,7 +2,7 @@ DIRS = LEVEL = ../../../.. -CFLAGS += -fms-extensions -march=native -mavx512f +CFLAGS += -fms-extensions -march=native -mavx512f -I${SourceDir}/.. LDFLAGS += -lm include $(LEVEL)/SingleSource/Makefile.singlesrc Index: SingleSource/UnitTests/Vector/AVX512F/alignr.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/alignr.c +++ SingleSource/UnitTests/Vector/AVX512F/alignr.c @@ -0,0 +1,285 @@ +/* + * Test 512-bit intrinsics related to valignd and valignq. + * Here we check for _mm512_[mask|maskz]_alignr_epi* + * intrinsics. 
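+ * valignd/valignq concatenate the two source vectors and extract a window
+ * of contiguous 32-/64-bit elements starting at the immediate offset, e.g.
+ * _mm512_alignr_epi32(a, b, 3) yields { b[3..15], a[0..2] }.  The masked
+ * forms then blend that window with the destination (or with zero).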
+ */ + +#include "m512_test_util.h" +#include + +volatile int vol0 = 0; + +#define soft_update(v512) (v512).xmmi[vol0] = (v512).xmmi[vol0] + +#define TEST_MASK(instr, elembits, mask, imm, num_elems, dtype) \ + { /* Blend masking */ \ + memcpy(&res, &dtype##res_orig, sizeof(res)); \ + soft_update(dtype##op2); \ + res.zmmi = _mm512_mask_alignr_epi##elembits( \ + res.zmmi, mask, dtype##op1.zmmi, dtype##op2.zmmi, imm); \ + \ + /* Compute the expected result. */ \ + expect.zmmi = \ + compute_##instr(mask, 0, &dtype##op1, &dtype##op2, imm, num_elems); \ + \ + /* Compare the obtained and expected results. */ \ + check_equal_n##dtype(&res, &expect, num_elems, \ + "_mm512_mask_alignr_epi" #elembits ", " #imm, \ + __LINE__); \ + /* Verify combination with masked load. */ \ + { \ + __m512i src2_copy, src2 = dtype##op1.zmmi; \ + memcpy(&res, &dtype##res_orig, sizeof(res)); \ + soft_update(dtype##op2); \ + src2 = _mm512_mask_load_epi##elembits(src2, mask, &dtype##op2.zmmi); \ + res.zmmi = _mm512_mask_alignr_epi##elembits(res.zmmi, mask, \ + dtype##op1.zmmi, src2, imm); \ + soft_update(dtype##op2); \ + src2_copy = _mm512_mask_load_epi##elembits(dtype##op1.zmmi, mask, \ + &dtype##op2.zmmi); \ + expect.zmmi = \ + compute_##instr(mask, 0, &dtype##op1, &src2_copy, imm, num_elems); \ + check_equal_n##dtype(&res, &expect, num_elems, \ + "mix with load _mm512_mask_alignr_epi" #elembits ", \ + " #imm, __LINE__); \ + } \ + } + +#define TEST_MASKZ(instr, elembits, mask, imm, num_elems, dtype) \ + { \ + /* Zero masking */ \ + memset(&res, 0xFF, sizeof(res)); \ + soft_update(dtype##op2); \ + res.zmmi = _mm512_maskz_alignr_epi##elembits(mask, dtype##op1.zmmi, \ + dtype##op2.zmmi, imm); \ + \ + /* Compute the expected result. */ \ + expect.zmmi = \ + compute_##instr(mask, 1, &dtype##op1, &dtype##op2, imm, num_elems); \ + \ + /* Compare the obtained and expected results. */ \ + check_equal_n##dtype(&res, &expect, num_elems, \ + "_mm512_maskz_alignr_epi" #elembits ", " #imm, \ + __LINE__); \ + } + +#define TEST(instr, elembits, imm, num_elems, dtype) \ + { \ + /* No masking */ \ + memset(&res, 0xFF, sizeof(res)); \ + soft_update(dtype##op2); \ + res.zmmi = \ + _mm512_alignr_epi##elembits(dtype##op1.zmmi, dtype##op2.zmmi, imm); \ + \ + /* Compute the expected result. */ \ + expect.zmmi = compute_##instr((1 << (num_elems)) - 1, 0, &dtype##op1, \ + &dtype##op2, imm, num_elems); \ + \ + /* Compare the obtained and expected results. 
*/ \ + check_equal_n##dtype(&res, &expect, num_elems, \ + "_mm512_alignr_epi" #elembits ", " #imm, __LINE__); \ + } + +#define TEST_ALIGN(instr, elembits, mask, imm, num_elems, dtype) \ + TEST_MASK(instr, elembits, mask, imm, num_elems, dtype) \ + TEST_MASKZ(instr, elembits, mask, imm, num_elems, dtype) \ + TEST(instr, elembits, imm, num_elems, dtype) + +#define TEST_ALIGND(mask, imm) TEST_ALIGN(zalignd, 32, mask, imm, 16, d) + +#define TEST_ALIGNQ(mask, imm) TEST_ALIGN(zalignq, 64, mask, imm, 8, q) + +V512 dop1, dop2, dres_orig; +V512 qop1, qop2, qres_orig; +V512 res, expect; + +volatile unsigned int dres_orig_arr[16] = { + 0x12345678, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, + 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff}; + +volatile U64 qres_orig_arr[8] = {0x123456789abcdef0, 0x1111111111111111, + 0x2222222222222222, 0x3333333333333333, + 0x4444444444444444, 0x5555555555555555, + 0x7777777777777777, 0x6666666666666666}; + +static void NOINLINE init() { + int i; + + for (i = 0; i < 16; i++) { + dop1.u32[i] = 0x11000000 + i; + } + for (i = 0; i < 16; i++) { + dop2.u32[i] = 0x22000000 + i; + } + + for (i = 0; i < 8; i++) { + qop1.u64[i] = 0x1111000000000000 + i; + } + for (i = 0; i < 8; i++) { + qop2.u64[i] = 0x2222000000000000 + i; + } + + memcpy((void *)&dres_orig, (void *)dres_orig_arr, 64); + memcpy((void *)&qres_orig, (void *)qres_orig_arr, 64); +} + +__m512i NOINLINE compute_zalignd(__mmask16 mask, int zero_mask, const void *op1, + const void *op2, int imm, int num_elems) { + V512 res; + int i, res_idx; + + res_idx = 0; + for (i = 0; i < 2; i++) { + int lower, upper, op_idx; + unsigned int *vop; + + if (i == 0) { + lower = imm; + upper = num_elems; + vop = (unsigned int *)op2; + } else { + lower = 0; + upper = imm; + vop = (unsigned int *)op1; + } + + for (op_idx = lower; op_idx < upper; op_idx++) { + + int elem_mask = mask & (1 << res_idx); + + if (elem_mask) { + res.u32[res_idx] = vop[op_idx]; + } else if (zero_mask) { + res.u32[res_idx] = 0; + } else { + res.u32[res_idx] = dres_orig.u32[res_idx]; + } + + res_idx++; + } + } + + return res.zmmi; +} + +void NOINLINE do_zalignd() { + TEST_ALIGND(0x0000, 0); + TEST_ALIGND(0xabcd, 0); + TEST_ALIGND(0xffff, 0); + + TEST_ALIGND(0x0000, 1); + TEST_ALIGND(0xabcd, 1); + TEST_ALIGND(0xfef7, 1); + TEST_ALIGND(0xffff, 1); + + TEST_ALIGND(0xabcd, 3); + TEST_ALIGND(0xfefe, 5); + + TEST_ALIGND(0x0000, 7); + TEST_ALIGND(0xabcd, 7); + TEST_ALIGND(0xffff, 7); + + TEST_ALIGND(0x0000, 8); + TEST_ALIGND(0xabcd, 8); + TEST_ALIGND(0xffff, 8); + + TEST_ALIGND(0x0000, 9); + TEST_ALIGND(0xabcd, 9); + TEST_ALIGND(0xffff, 9); + + TEST_ALIGND(0x0000, 14); + TEST_ALIGND(0xabcd, 14); + TEST_ALIGND(0xfef7, 14); + TEST_ALIGND(0xffff, 14); + + TEST_ALIGND(0x0000, 15); + TEST_ALIGND(0xabcd, 15); + TEST_ALIGND(0xffff, 15); +} + +__m512i NOINLINE compute_zalignq(int mask, int zero_mask, const void *op1, + const void *op2, int imm, int num_elems) { + V512 res; + int i, res_idx; + + res_idx = 0; + for (i = 0; i < 2; i++) { + int lower, upper, op_idx; + U64 *vop; + + if (i == 0) { + lower = imm; + upper = num_elems; + vop = (U64 *)op2; + } else { + lower = 0; + upper = imm; + vop = (U64 *)op1; + } + + for (op_idx = lower; op_idx < upper; op_idx++) { + + int elem_mask = mask & (1 << res_idx); + + if (elem_mask) { + res.u64[res_idx] = vop[op_idx]; + } else if (zero_mask) { + res.u64[res_idx] = 0; + } else { + res.u64[res_idx] = qres_orig.u64[res_idx]; + } + + res_idx++; + } + } + 
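+  /* At this point res holds op2[imm..7] followed by op1[0..imm-1], with the
+     write/zero mask already applied per element. */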
+ return res.zmmi; +} + +void NOINLINE do_zalignq() { + TEST_ALIGNQ(0x00, 0); + TEST_ALIGNQ(0xbe, 0); + TEST_ALIGNQ(0xff, 0); + + TEST_ALIGNQ(0x00, 1); + TEST_ALIGNQ(0xbe, 1); + TEST_ALIGNQ(0xff, 1); + + TEST_ALIGNQ(0x00, 3); + TEST_ALIGNQ(0xbe, 3); + TEST_ALIGNQ(0xff, 3); + + TEST_ALIGNQ(0x00, 4); + TEST_ALIGNQ(0xbe, 4); + TEST_ALIGNQ(0xff, 4); + + TEST_ALIGNQ(0x00, 5); + TEST_ALIGNQ(0xbe, 5); + TEST_ALIGNQ(0xff, 5); + + TEST_ALIGNQ(0x00, 6); + TEST_ALIGNQ(0xbe, 6); + TEST_ALIGNQ(0xff, 6); + + TEST_ALIGNQ(0x00, 7); + TEST_ALIGNQ(0xbe, 7); + TEST_ALIGNQ(0xe7, 7); + TEST_ALIGNQ(0xff, 7); +} + +int main() { + init(); + + do_zalignd(); + + do_zalignq(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/alignr.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/alignr.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/alignr.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/broadcast.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/broadcast.c +++ SingleSource/UnitTests/Vector/AVX512F/broadcast.c @@ -0,0 +1,144 @@ +/* + * Test broadcast instructions. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_broadcast_f32x4() + * _mm512_broadcast_f64x4() + * _mm512_broadcast_i32x4() + * _mm512_broadcastload_i64x4() + * _mm512_mask_broadcast_f32x4() + * _mm512_mask_broadcast_f64x4() + * _mm512_mask_broadcast_i32x4() + * _mm512_mask_broadcast_i64x4() + * _mm512_maskz_broadcast_f32x4() + * _mm512_maskz_broadcast_f64x4() + * _mm512_maskz_broadcast_i32x4() + * _mm512_maskz_broadcast_i64x4() + */ + +#include "m512_test_util.h" + +V512 i32; +V512 f32; +V512 i64; +V512 f64; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + f32.f32[i] = (float)i; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + f64.f64[i] = (double)i; + } +} + +void NOINLINE do_32x4() { + V512 res; + V512 expected; + __mmask16 k; + + /*************************** 512 bit intrinsics ***********************/ + /* Unmasked op */ + res.zmm = _mm512_broadcast_f32x4(f32.xmm[0]); + expected.zmm = _mm512_setr4_ps(0.0f, 1.0f, 2.0f, 3.0f); + check_equal_nd(&res, &expected, 16, "_mm512_broadcast_f32x4", __LINE__); + + /* Write mask */ + k = 0x7e9a; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_broadcast_f32x4(res.zmm, k, f32.xmm[0]); + expected.zmm = _mm512_setr4_ps(0.0f, 1.0f, 2.0f, 3.0f); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, expected.zmm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_broadcast_f32x4", __LINE__); + + /* Zero mask */ + res.zmm = _mm512_maskz_broadcast_f32x4(k, f32.xmm[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_broadcast_f32x4", __LINE__); + + /* Unmasked op */ + res.zmmi = _mm512_broadcast_i32x4(i32.xmmi[0]); + expected.zmmi = _mm512_setr4_epi32(0, 1, 2, 3); + check_equal_nd(&res, &expected, 16, "_mm512_broadcast_i32x4", __LINE__); + + /* Write mask */ + k = 0x789a; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_broadcast_i32x4(res.zmmi, k, i32.xmmi[0]); + expected.zmmi = _mm512_setr4_epi32(0, 1, 2, 3); + expected.zmmi = + _mm512_mask_mov_epi32(_mm512_setzero_epi32(), k, expected.zmmi); + check_equal_nd(&res, &expected, 16, "_mm512_mask_broadcast_i32x4", __LINE__); + + /* Zero mask */ + 
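/* The expected value computed for the write-mask case still applies: with an
+     all-zero old value, write-masking and zero-masking give the same result. */
+ 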
res.zmmi = _mm512_maskz_broadcast_i32x4(k, i32.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_broadcast_i32x4", __LINE__); + + /*************************** 256 bit intrinsics ***********************/ + + /* Write mask */ + k = 0x7e9a; + res.zmmi = _mm512_setzero_epi32(); + expected.zmm = _mm512_setr4_ps(0.0f, 1.0f, 2.0f, 3.0f); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, expected.zmm); +} + +void NOINLINE do_64x4() { + V512 res; + V512 expected; + __mmask8 k; + + /* Unmasked op */ + res.zmmd = _mm512_broadcast_f64x4(f64.ymmd[0]); + expected.zmmd = _mm512_set_pd(3.0, 2.0, 1.0, 0.0, 3.0, 2.0, 1.0, 0.0); + check_equal_nd(&res, &expected, 16, "_mm512_broadcast_f64x4", __LINE__); + + /* Write mask */ + k = 0xe4; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_broadcast_f64x4(res.zmmd, k, f64.ymmd[0]); + expected.zmmd = _mm512_set_pd(3.0, 2.0, 1.0, 0.0, 3.0, 2.0, 1.0, 0.0); + expected.zmmd = _mm512_mask_mov_pd(_mm512_setzero_pd(), k, expected.zmmd); + check_equal_nd(&res, &expected, 16, "_mm512_mask_broadcast_f64x4", __LINE__); + + /* Zero mask */ + res.zmmd = _mm512_maskz_broadcast_f64x4(k, f64.ymmd[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_broadcast_f64x4", __LINE__); + + /* Unmasked op */ + res.zmmi = _mm512_broadcast_i64x4(i64.ymmi[0]); + expected.zmmi = _mm512_set_epi64(3, 2, 1, 0, 3, 2, 1, 0); + check_equal_nd(&res, &expected, 16, "_mm512_broadcast_i64x4", __LINE__); + + /* Write mask */ + k = 0xdf; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_broadcast_i64x4(res.zmmi, k, i64.ymmi[0]); + expected.zmmi = _mm512_set_epi64(3, 2, 1, 0, 3, 2, 1, 0); + expected.zmmi = + _mm512_mask_mov_epi64(_mm512_setzero_epi32(), k, expected.zmmi); + check_equal_nd(&res, &expected, 16, "_mm512_mask_broadcast_i64x4", __LINE__); + + /* Zero mask */ + res.zmmi = _mm512_maskz_broadcast_i64x4(k, i64.ymmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_broadcast_i64x4", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_32x4(); + do_64x4(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/broadcast.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/broadcast.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/broadcast.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/compress.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/compress.c +++ SingleSource/UnitTests/Vector/AVX512F/compress.c @@ -0,0 +1,241 @@ + +/* + * Test the compress family of intrinsics. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_mask_compress_epi32() + * _mm512_mask_compress_epi64() + * _mm512_mask_compress_ps() + * _mm512_mask_compress_pd() + * _mm512_mask_compressstoreu_epi32() + * _mm512_mask_compressstoreu_epi64() + * _mm512_mask_compressstoreu_ps() + * _mm512_mask_compressstoreu_pd() + */ + +#include "m512_test_util.h" +#include + +V512 i32; +V512 i64; +V512 f32; +V512 f64; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + f32.f32[i] = i; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + f64.f64[i] = i; + } +} + +/* + * Check that the low N 32-bit elements of "got" and "expected" are the same. 
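+ * Only the first mismatch is reported, and n_errs is incremented once per
+ * failing call.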
+ */ +void NOINLINE check_equal_n(void *got, void *expected, int n_elems, + char *banner) { + int i; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u32[i] != v2->u32[i]) { + printf("ERROR: %s failed at %d'th element: 0x%0.8x != 0x%0.8x\n", + banner ? banner : "", i, v1->u32[i], v2->u32[i]); + n_errs++; + break; + } + } +} + +void NOINLINE do_512_compress_epi32() { + V512 res; + V512 expected; + volatile int i, j; + __mmask16 k = 0x7923; + + res.zmmi = _mm512_mask_compress_epi32(i32.zmmi, k, i32.zmmi); + for (i = 0, j = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[j++] = i32.s32[i]; + } + } + for (i = j; i < 16; i++) { + expected.s32[i] = i32.s32[i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compress_epi32", __LINE__); +} + +void NOINLINE do_512_compress_epi64() { + V512 res; + V512 expected; + volatile int i, j; + __mmask8 k = 0xbd; + + res.zmmi = _mm512_mask_compress_epi64(i64.zmmi, k, i64.zmmi); + for (i = 0, j = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.s64[j++] = i64.s64[i]; + } + } + for (i = j; i < 8; i++) { + expected.s64[i] = i64.s64[i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compress_epi64", __LINE__); +} + +void NOINLINE do_512_compress_ps() { + V512 res; + V512 expected; + volatile int i, j; + __mmask16 k = 0xabcd; + + res.zmm = _mm512_mask_compress_ps(f32.zmm, k, f32.zmm); + for (i = 0, j = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[j++] = f32.f32[i]; + } + } + for (i = j; i < 16; i++) { + expected.f32[i] = f32.f32[i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compress_ps", __LINE__); +} + +void NOINLINE do_512_compress_pd() { + V512 res; + V512 expected; + volatile int i, j; + __mmask8 k = 0x57; + + res.zmmd = _mm512_mask_compress_pd(f64.zmmd, k, f64.zmmd); + for (i = 0, j = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[j++] = f64.f64[i]; + } + } + for (i = j; i < 8; i++) { + expected.f64[i] = f64.f64[i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compress_pd", __LINE__); +} + +void NOINLINE do_512_compressstore_epi32() { + V512 res; + V512 expected; + volatile int i, j; + __mmask16 k = 0x3297; + + for (i = 0; i < 16; i++) { + res.s32[i] = -13; + } + + _mm512_mask_compressstoreu_epi32(&res, k, i32.zmmi); + for (i = 0, j = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[j++] = i32.s32[i]; + } + } + for (i = j; i < 16; i++) { + expected.s32[i] = -13; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compressstoreu_epi32", + __LINE__); +} + +void NOINLINE do_512_compressstore_epi64() { + V512 res; + V512 expected; + volatile int i, j; + __mmask8 k = 0x9c; + + for (i = 0; i < 8; i++) { + res.s64[i] = -72; + } + + _mm512_mask_compressstoreu_epi64(&res, k, i64.zmmi); + for (i = 0, j = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.s64[j++] = i64.s64[i]; + } + } + for (i = j; i < 8; i++) { + expected.s64[i] = -72; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compressstoreu_epi64", + __LINE__); +} + +void NOINLINE do_512_compressstore_ps() { + V512 res; + V512 expected; + volatile int i, j; + __mmask16 k = 0xdcf3; + + for (i = 0; i < 16; i++) { + res.f32[i] = -100.0f; + } + + _mm512_mask_compressstoreu_ps(&res, k, f32.zmm); + for (i = 0, j = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[j++] = f32.f32[i]; + } + } + for (i = j; i < 16; i++) { + expected.f32[i] = -100.0f; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compressstoreu_ps", + __LINE__); +} + +void NOINLINE do_512_compressstore_pd() { + 
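/* Fill the destination with a sentinel value, compress-store the masked f64
+     elements, then verify the packed prefix and the untouched tail. */
+ 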
V512 res; + V512 expected; + volatile int i, j; + __mmask8 k = 0xf5; + + for (i = 0; i < 8; i++) { + res.f64[i] = -99.0; + } + + _mm512_mask_compressstoreu_pd(&res, k, f64.zmmd); + for (i = 0, j = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[j++] = f64.f64[i]; + } + } + for (i = j; i < 8; i++) { + expected.f64[i] = -99.0; + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_compressstoreu_pd", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_512_compress_epi32(); + do_512_compress_epi64(); + do_512_compress_ps(); + do_512_compress_pd(); + + do_512_compressstore_epi32(); + do_512_compressstore_epi64(); + do_512_compressstore_ps(); + do_512_compressstore_pd(); + + if (n_errs) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/compress.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/compress.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/compress.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/convert.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/convert.c +++ SingleSource/UnitTests/Vector/AVX512F/convert.c @@ -0,0 +1,413 @@ +/* + * Exercise some convert instructions. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_cvt_roundph_ps() + * _mm512_mask_cvt_roundph_ps() + * _mm512_maskz_cvt_roundph_ps() + * _mm512_cvtph_ps() + * _mm512_mask_cvtph_ps() + * _mm512_maskz_cvtph_ps() + * _mm512_cvt_roundps_ph() + * _mm512_mask_cvt_roundps_ph() + * _mm512_maskz_cvt_roundps_ph() + * _mm512_cvtps_ph() + * _mm512_mask_cvtps_ph() + * _mm512_maskz_cvtps_ph() + */ + +#include +#include +#include + +typedef union V256 { + __m128i m128i; + __m128 m128; + __m128d m128d; + __m256 m256; + __m256d m256d; + __m256i m256i; + short w[16]; + int d[8]; + long long q[4]; + float ps[8]; + double pd[4]; + + int i32; + unsigned int u32; + __int64 i64; + unsigned __int64 u64; +} V256; + +int n_errors = 0; + +void print(const char *str, int num_elts, int elt_size, V256 *p, int is_float) { + int i; + + if (elt_size == 2 && is_float) { + if (num_elts == 4) { + p->m128 = _mm_cvtph_ps(p->m128i); + } else { + p->m256 = _mm256_cvtph_ps(p->m128i); + } + } + + printf("%s = {", str); + for (i = 0; i < num_elts; i++) { + if (!is_float) { + int val; + switch (elt_size) { + case 2: + val = p->w[i]; + break; + case 4: + val = p->d[i]; + break; + case 8: + val = p->q[i]; + break; + } + printf("%s %3d", i == 0 ? "" : ",", val); + } else { + float val; + switch (elt_size) { + case 2: + val = p->ps[i]; + break; + case 4: + val = p->ps[i]; + break; + case 8: + val = p->pd[i]; + break; + } + printf("%s %.3f", i == 0 ? 
"" : ",", val); + } + } + printf("}\n"); +} + +__declspec(noinline) void check(int is_float, int elt_size, int num_elts, + void *v1, void *v2, const char *str) { + if (memcmp(v1, v2, elt_size * num_elts) != 0) { + ++n_errors; + printf("FAILED: %dx%d (%s)\n", elt_size, num_elts, str); + + print("exp", num_elts, elt_size, v1, is_float); + print("got", num_elts, elt_size, v2, is_float); + } +} + +#define r _MM_FROUND_NO_EXC +int mask = 0xAAA; // b101010101010 + +void float16_converts() { + +#define M512 _mm512_set_ps + +#define M512_RES M512(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) +#define M512_PASS \ + M512(11, 22, 33, 44, 55, 66, 77, 88, 99, 1010, 1111, 1212, 1313, 1414, 1515, \ + 1616) +#define M512_RES_MASK \ + M512(11, 22, 33, 44, 5, 66, 7, 88, 9, 1010, 11, 1212, 13, 1414, 15, 1616) +#define M512_RES_MASKZ M512(0, 0, 0, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0) + +#define M256H_SRC _mm512_cvt_roundps_ph(M512_RES, r) + + { + __m512 got = _mm512_cvt_roundph_ps(M256H_SRC, r); + __m512 exp = M512_RES; + check(1, 4, 16, &exp, &got, "_mm512_cvt_roundph_ps"); + } + { + __m512 got = _mm512_mask_cvt_roundph_ps(M512_PASS, mask, M256H_SRC, r); + __m512 exp = M512_RES_MASK; + check(1, 4, 16, &exp, &got, "_mm512_mask_cvt_roundph_ps"); + } + { + __m512 got = _mm512_maskz_cvt_roundph_ps(mask, M256H_SRC, r); + __m512 exp = M512_RES_MASKZ; + check(1, 4, 16, &exp, &got, "_mm512_maskz_cvt_roundph_ps"); + } + + { + __m512 got = _mm512_cvtph_ps(M256H_SRC); + __m512 exp = M512_RES; + check(1, 4, 16, &exp, &got, "_mm512_cvtph_ps"); + } + { + __m512 got = _mm512_mask_cvtph_ps(M512_PASS, mask, M256H_SRC); + __m512 exp = M512_RES_MASK; + check(1, 4, 16, &exp, &got, "_mm512_mask_cvtph_ps"); + } + { + __m512 got = _mm512_maskz_cvtph_ps(mask, M256H_SRC); + __m512 exp = M512_RES_MASKZ; + check(1, 4, 16, &exp, &got, "_mm512_maskz_cvtph_ps"); + } + +#define M512_SRC M512(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + +#define M256H(m512) _mm512_cvt_roundps_ph(m512, r) + +#define M256H_PASS \ + M256H(M512(11, 22, 33, 44, 55, 66, 77, 88, 99, 1010, 1111, 1212, 1313, 1414, \ + 1515, 1616)) +#define M256H_RES M256H(M512_SRC) +#define M256H_RES_MASK \ + M256H(M512(11, 22, 33, 44, 5, 66, 7, 88, 9, 1010, 11, 1212, 13, 1414, 15, \ + 1616)) +#define M256H_RES_MASKZ \ + M256H(M512(0, 0, 0, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0)) + + { + V256 got, exp; + got.m256i = _mm512_cvt_roundps_ph(M512_SRC, r); + exp.m256i = M256H_RES; + check(1, 2, 16, &exp, &got, "_mm512_cvt_roundps_ph"); + } + { + V256 got, exp; + got.m256i = _mm512_mask_cvt_roundps_ph(M256H_PASS, mask, M512_SRC, r); + exp.m256i = M256H_RES_MASK; + check(1, 2, 16, &exp, &got, "_mm512_mask_cvt_roundps_ph"); + } + { + V256 got, exp; + got.m256i = _mm512_maskz_cvt_roundps_ph(mask, M512_SRC, r); + exp.m256i = M256H_RES_MASKZ; + check(1, 2, 16, &exp, &got, "_mm512_maskz_cvt_roundps_ph"); + } + + { + V256 got, exp; + got.m256i = _mm512_cvtps_ph(M512_SRC, r); + exp.m256i = M256H_RES; + check(1, 2, 16, &exp, &got, "_mm512_cvtps_ph"); + } + { + V256 got, exp; + got.m256i = _mm512_mask_cvtps_ph(M256H_PASS, mask, M512_SRC, r); + exp.m256i = M256H_RES_MASK; + check(1, 2, 16, &exp, &got, "_mm512_mask_cvtps_ph"); + } + { + V256 got, exp; + got.m256i = _mm512_maskz_cvtps_ph(mask, M512_SRC, r); + exp.m256i = M256H_RES_MASKZ; + check(1, 2, 16, &exp, &got, "_mm512_maskz_cvtps_ph"); + } +} + +__declspec(noinline) void scalar_converts() { + +#define M128SD(scalar) _mm_set_pd(123, scalar) + +#define CHECK__(core, src, res_type, res) \ + { \ + V256 got, exp; \ + 
((got).res_type) = _mm_##core src; \ + ((exp).res_type) = res; \ + check(0, sizeof((exp).res_type), 1, &exp, &got, "_mm_" #core); \ + } + +#if defined(__x86_64) || defined(_M_X64) +#define CHECK64 CHECK__ +#else +#define CHECK64(core, src, res_type, res) +#endif + +#undef R +#define R _MM_FROUND_FLOOR | _MM_FROUND_NO_EXC + + CHECK__(cvt_roundsd_i32, (M128SD(100.7), R), i32, 100) + CHECK__(cvtsd_i32, (M128SD(100.7)), i32, 101) + CHECK__(cvtsd_si32, (M128SD(100.7)), i32, 101) + CHECK__(cvt_roundsd_u32, (M128SD(100.7), R), u32, 100) + CHECK__(cvtsd_u32, (M128SD(100.7)), u32, 101) + + CHECK64(cvt_roundsd_i64, (M128SD(100.7), R), i64, 100) + CHECK64(cvtsd_i64, (M128SD(100.7)), i64, 101) + CHECK64(cvtsd_si64, (M128SD(100.7)), i64, 101) + CHECK64(cvt_roundsd_u64, (M128SD(100.7), R), u64, 100) + CHECK64(cvtsd_u64, (M128SD(100.7)), u64, 101) + +#undef R +#define R _MM_FROUND_NO_EXC + + CHECK__(cvtt_roundsd_i32, (M128SD(100.7), R), i32, 100) + CHECK__(cvttsd_i32, (M128SD(100.7)), i32, 100) + CHECK__(cvttsd_si32, (M128SD(100.7)), i32, 100) + CHECK__(cvtt_roundsd_u32, (M128SD(100.7), R), u32, 100) + CHECK__(cvttsd_u32, (M128SD(100.7)), u32, 100) + + CHECK64(cvtt_roundsd_i64, (M128SD(100.7), R), i64, 100) + CHECK64(cvttsd_i64, (M128SD(100.7)), i64, 100) + CHECK64(cvttsd_si64, (M128SD(100.7)), i64, 100) + CHECK64(cvtt_roundsd_u64, (M128SD(100.7), R), u64, 100) + CHECK64(cvttsd_u64, (M128SD(100.7)), u64, 100) + + CHECK64(cvt_roundi64_sd, (M128SD(100.7), 35, R), m128d, M128SD(35)) + CHECK64(cvt_roundsi64_sd, (M128SD(100.7), 35, R), m128d, M128SD(35)) + CHECK64(cvt_roundu64_sd, (M128SD(100.7), 35, R), m128d, M128SD(35)) + CHECK64(cvti64_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + CHECK64(cvtsi64_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + CHECK64(cvtu64_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + + // Rounding not supported for [ui]32->sd + CHECK__(cvti32_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + CHECK__(cvtsi32_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + CHECK__(cvtu32_sd, (M128SD(100.7), 35), m128d, M128SD(35)) + +#define M128SS(scalar) _mm_set_ps(1, 2, 3, scalar) +#undef R +#define R _MM_FROUND_FLOOR | _MM_FROUND_NO_EXC + + CHECK__(cvt_roundss_i32, (M128SS(100.7), R), i32, 100) + CHECK__(cvt_roundss_u32, (M128SS(100.7), R), u32, 100) + CHECK__(cvtss_i32, (M128SS(100.7)), i32, 101) + CHECK__(cvtss_si32, (M128SS(100.7)), i32, 101) + CHECK__(cvtss_u32, (M128SS(100.7)), u32, 101) + + CHECK64(cvt_roundss_i64, (M128SS(100.7), R), i64, 100) + CHECK64(cvt_roundss_u64, (M128SS(100.7), R), u64, 100) + CHECK64(cvtss_i64, (M128SS(100.7)), i64, 101) + CHECK64(cvtss_si64, (M128SS(100.7)), i64, 101) + CHECK64(cvtss_u64, (M128SS(100.7)), u64, 101) + +#undef R +#define R _MM_FROUND_NO_EXC + + CHECK__(cvtt_roundss_i32, (M128SS(100.7), R), i32, 100) + CHECK__(cvtt_roundss_u32, (M128SS(100.7), R), u32, 100) + CHECK__(cvttss_i32, (M128SS(100.7)), i32, 100) + CHECK__(cvttss_si32, (M128SS(100.7)), i32, 100) + CHECK__(cvttss_u32, (M128SS(100.7)), u32, 100) + + CHECK64(cvtt_roundss_i64, (M128SS(100.7), R), i64, 100) + CHECK64(cvtt_roundss_u64, (M128SS(100.7), R), u64, 100) + CHECK64(cvttss_i64, (M128SS(100.7)), i64, 100) + CHECK64(cvttss_si64, (M128SS(100.7)), i64, 100) + CHECK64(cvttss_u64, (M128SS(100.7)), u64, 100) + + CHECK__(cvt_roundi32_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK__(cvt_roundsi32_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK__(cvt_roundu32_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK__(cvti32_ss, (M128SS(100.7), 47), m128, M128SS(47)) + CHECK__(cvtsi32_ss, 
(M128SS(100.7), 47), m128, M128SS(47)) + CHECK__(cvtu32_ss, (M128SS(100.7), 47), m128, M128SS(47)) + + CHECK64(cvt_roundi64_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK64(cvt_roundsi64_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK64(cvt_roundu64_ss, (M128SS(100.7), 47, R), m128, M128SS(47)) + CHECK64(cvti64_ss, (M128SS(100.7), 47), m128, M128SS(47)) + CHECK64(cvtsi64_ss, (M128SS(100.7), 47), m128, M128SS(47)) + CHECK64(cvtu64_ss, (M128SS(100.7), 47), m128, M128SS(47)) + +#undef R +#define R _MM_FROUND_NO_EXC + +#define CHECK_M128D(core, src, res_type, res) \ + { \ + V256 got, exp; \ + ((got).res_type) = _mm_##core src; \ + ((exp).res_type) = res; \ + check(1, 8, 2, &exp, &got, "_mm_" #core); \ + } + +#define M128D(a, b) _mm_set_pd(a, b) + + CHECK_M128D(cvt_roundss_sd, (M128D(1, 11) /*src1*/, M128SS(51) /*src2*/, R), + m128d, M128D(1, 51)) + CHECK_M128D(cvtss_sd, (M128D(1, 11), M128SS(51)), m128d, M128D(1, 51)) + + // For masked operations we check both 0 and 1 masks + // + CHECK_M128D( + mask_cvt_roundss_sd, + (M128D(1, 11) /*dest*/, 1, M128D(2, 22) /*src1*/, M128SS(51) /*src2*/, R), + m128d, M128D(2, 51)) + CHECK_M128D(mask_cvt_roundss_sd, + (M128D(1, 11), 0, M128D(2, 22), M128SS(51), R), m128d, + M128D(2, 11)) + + CHECK_M128D( + mask_cvtss_sd, + (M128D(1, 11) /*dest*/, 1, M128D(2, 22) /*src1*/, M128SS(51) /*src2*/), + m128d, M128D(2, 51)) + CHECK_M128D(mask_cvtss_sd, (M128D(1, 11), 0, M128D(2, 22), M128SS(51)), m128d, + M128D(2, 11)) + + CHECK_M128D(maskz_cvt_roundss_sd, + (1, M128D(2, 22) /*src1*/, M128SS(51) /*src2*/, R), m128d, + M128D(2, 51)) + CHECK_M128D(maskz_cvt_roundss_sd, (0, M128D(2, 22), M128SS(51), R), m128d, + M128D(2, 0)) + + CHECK_M128D(maskz_cvtss_sd, (1, M128D(2, 22) /*src1*/, M128SS(51) /*src2*/), + m128d, M128D(2, 51)) + CHECK_M128D(maskz_cvtss_sd, (0, M128D(2, 22), M128SS(51)), m128d, M128D(2, 0)) + +#define M128(a, b, c, d) _mm_set_ps(a, b, c, d) +#define CHECK_M128(core, src, res_type, res) \ + { \ + V256 got, exp; \ + ((got).res_type) = _mm_##core src; \ + ((exp).res_type) = res; \ + check(1, 4, 4, &exp, &got, "_mm_" #core); \ + } + + CHECK_M128(cvt_roundsd_ss, + (M128(1, 11, 111, 1111) /*src1*/, M128D(2, 22) /*src2*/, R), m128, + M128(1, 11, 111, 22)) + CHECK_M128(cvtsd_ss, (M128(1, 11, 111, 1111), M128D(2, 22)), m128, + M128(1, 11, 111, 22)) + + // For masked operations we check both 0 and 1 masks + // + CHECK_M128(mask_cvt_roundsd_ss, + (M128(1, 11, 111, 1111) /*dest*/, 1, + M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/, R), + m128, M128(2, 22, 222, 33)) + CHECK_M128(mask_cvt_roundsd_ss, + (M128(1, 11, 111, 1111) /*dest*/, 0, + M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/, R), + m128, M128(2, 22, 222, 1111)) + + CHECK_M128(mask_cvtsd_ss, + (M128(1, 11, 111, 1111) /*dest*/, 1, + M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/), + m128, M128(2, 22, 222, 33)) + CHECK_M128(mask_cvtsd_ss, + (M128(1, 11, 111, 1111) /*dest*/, 0, + M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/), + m128, M128(2, 22, 222, 1111)) + + CHECK_M128(maskz_cvt_roundsd_ss, + (1, M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/, R), + m128, M128(2, 22, 222, 33)) + CHECK_M128(maskz_cvt_roundsd_ss, + (0, M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/, R), + m128, M128(2, 22, 222, 0)) + + CHECK_M128(maskz_cvtsd_ss, + (1, M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/), m128, + M128(2, 22, 222, 33)) + CHECK_M128(maskz_cvtsd_ss, + (0, M128(2, 22, 222, 2222) /*src1*/, M128D(3, 33) /*src2*/), m128, + M128(2, 22, 222, 0)) +} + +int 
main(void) { + float16_converts(); + scalar_converts(); + + if (n_errors) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/convert.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/convert.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/convert.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/expand_compress.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/expand_compress.c +++ SingleSource/UnitTests/Vector/AVX512F/expand_compress.c @@ -0,0 +1,515 @@ +/* + * Tests for expand intrinsics family. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_mask_compress*() + * _mm512_mask_compressstoreu*() + * _mm512_mask_expand*() + * _mm512_mask_expandloadu*() + * _mm512_maskz_compress*() + * _mm512_maskz_expand*() + * _mm512_maskz_expandloadu*() + */ + +#include "m512_test_util.h" +#include +#include + +volatile __int64 vol0; + +V512 isrc1; +V512 isrc2; + +V512 fsrc1; +V512 fsrc2; + +V512 dsrc1; +V512 dsrc2; + +V512 res; +V512 mres; + +__mmask8 k8; +__mmask16 k16; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + isrc1.s32[i] = i; + isrc2.s32[i] = i + 1; + + fsrc1.f32[i] = i * 1.0f; + fsrc2.f32[i] = i * 2.0f; + } + + for (i = 0; i < 8; i++) { + dsrc1.f64[i] = i * 4.0; + dsrc2.f64[i] = i * 5.0; + } + + k8 = 0x5a; + k16 = 0x25d6; +} + +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. + */ +#define soft_isrc1_update() isrc1.xmmi[vol0] = isrc1.xmmi[vol0] +#define soft_fsrc1_update() fsrc1.xmmi[vol0] = fsrc1.xmmi[vol0] +#define soft_dsrc1_update() dsrc1.xmmi[vol0] = dsrc1.xmmi[vol0] + +/* + * Model expand intrinsic behavior. 
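+ * Expand is the inverse of compress: consecutive elements of the source are
+ * scattered to the destination positions selected by the mask; unselected
+ * positions keep the old destination values (or become zero in the maskz forms).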
+ */ +void NOINLINE model_mask_expand_i32(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + int *v1i = (int *)input1; + int *v2i = (int *)input2; + int *v3o = (int *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = v1i[i]; + } + } +} + +void NOINLINE model_maskz_expand_i32(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + int *v2i = (int *)input2; + int *v3o = (int *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = 0; + } + } +} + +void NOINLINE model_mask_expand_i64(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + __int64 *v1i = (__int64 *)input1; + __int64 *v2i = (__int64 *)input2; + __int64 *v3o = (__int64 *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = v1i[i]; + } + } +} + +void NOINLINE model_maskz_expand_i64(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + __int64 *v2i = (__int64 *)input2; + __int64 *v3o = (__int64 *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = 0; + } + } +} + +void NOINLINE model_mask_expand_f32(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + float *v1i = (float *)input1; + float *v2i = (float *)input2; + float *v3o = (float *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = v1i[i]; + } + } +} + +void NOINLINE model_maskz_expand_f32(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + float *v2i = (float *)input2; + float *v3o = (float *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = 0.f; + } + } +} + +void NOINLINE model_mask_expand_f64(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + double *v1i = (double *)input1; + double *v2i = (double *)input2; + double *v3o = (double *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = v1i[i]; + } + } +} + +void NOINLINE model_maskz_expand_f64(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + double *v2i = (double *)input2; + double *v3o = (double *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[i] = v2i[j]; + j++; + } else { + v3o[i] = 0.; + } + } +} + +#define GEN_MASK_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, modeller, \ + checker) \ + res.suffix = intrin(prefix##src2.suffix, mask, prefix##src1.suffix); \ + modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \ + (void *)&mres.suffix, n_elem); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_MASK_LOAD_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \ + modeller, checker) \ + res.suffix = intrin(prefix##src2.suffix, mask, &prefix##src1.suffix); \ + modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \ + (void *)&mres.suffix, n_elem); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_MASKZ_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, modeller, \ + checker) \ + res.suffix = intrin(mask, 
prefix##src1.suffix); \ + modeller(mask, (void *)&prefix##src1.suffix, (void *)&mres.suffix, n_elem); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_MASKZ_LOAD_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \ + modeller, checker) \ + res.suffix = intrin(mask, &prefix##src1.suffix); \ + modeller(mask, (void *)&prefix##src1.suffix, (void *)&mres.suffix, n_elem); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +void NOINLINE do_m512_expand() { + volatile V512 res; + + soft_isrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_expand_epi32, i, zmmi, k16, 16, + model_mask_expand_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_epi32, i, zmmi, k16, 16, + model_mask_expand_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_epi32, i, zmmi, k16, 16, + model_maskz_expand_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_epi32, i, zmmi, k16, 16, + model_maskz_expand_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_expand_epi64, i, zmmi, k8, 8, + model_mask_expand_i64, check_equal_nq); + + soft_isrc1_update(); + GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_epi64, i, zmmi, k8, 8, + model_mask_expand_i64, check_equal_nq); + + soft_isrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_epi64, i, zmmi, k8, 8, + model_maskz_expand_i64, check_equal_nq); + + soft_isrc1_update(); + GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_epi64, i, zmmi, k8, 8, + model_maskz_expand_i64, check_equal_nq); + + soft_fsrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_expand_ps, f, zmm, k16, 16, + model_mask_expand_f32, check_equal_nsf); + + soft_fsrc1_update(); + GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_ps, f, zmm, k16, 16, + model_mask_expand_f32, check_equal_nsf); + + soft_fsrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_ps, f, zmm, k16, 16, + model_maskz_expand_f32, check_equal_nsf); + + soft_fsrc1_update(); + GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_ps, f, zmm, k16, 16, + model_maskz_expand_f32, check_equal_nsf); + + soft_dsrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_expand_pd, d, zmmd, k8, 8, + model_mask_expand_f64, check_equal_ndf); + + soft_dsrc1_update(); + GEN_MASK_LOAD_CHECK_CASE(_mm512_mask_expandloadu_pd, d, zmmd, k8, 8, + model_mask_expand_f64, check_equal_ndf); + + soft_dsrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_expand_pd, d, zmmd, k8, 8, + model_maskz_expand_f64, check_equal_ndf); + + soft_dsrc1_update(); + GEN_MASKZ_LOAD_CHECK_CASE(_mm512_maskz_expandloadu_pd, d, zmmd, k8, 8, + model_maskz_expand_f64, check_equal_ndf); +} + +/* + * Model compress intrinsic behavior. 
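+ * Compress packs the mask-selected elements of the source contiguously into
+ * the low positions of the destination; the remaining positions keep the old
+ * destination values (or become zero in the maskz forms).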
+ */ +void NOINLINE model_mask_compress_i32(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + int *v1i = (int *)input1; + int *v2i = (int *)input2; + int *v3o = (int *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = v1i[i]; + } +} + +void NOINLINE model_maskz_compress_i32(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + int *v2i = (int *)input2; + int *v3o = (int *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = 0; + } +} + +void NOINLINE model_mask_compress_i64(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + __int64 *v1i = (__int64 *)input1; + __int64 *v2i = (__int64 *)input2; + __int64 *v3o = (__int64 *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = v1i[i]; + } +} + +void NOINLINE model_maskz_compress_i64(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + __int64 *v2i = (__int64 *)input2; + __int64 *v3o = (__int64 *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = 0; + } +} + +void NOINLINE model_mask_compress_f32(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + float *v1i = (float *)input1; + float *v2i = (float *)input2; + float *v3o = (float *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = v1i[i]; + } +} + +void NOINLINE model_maskz_compress_f32(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + float *v2i = (float *)input2; + float *v3o = (float *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = 0; + } +} + +void NOINLINE model_mask_compress_f64(void *input1, __int64 mask, void *input2, + void *output, int n_elems) { + int i, j = 0; + double *v1i = (double *)input1; + double *v2i = (double *)input2; + double *v3o = (double *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = v1i[i]; + } +} + +void NOINLINE model_maskz_compress_f64(__int64 mask, void *input2, void *output, + int n_elems) { + int i, j = 0; + double *v2i = (double *)input2; + double *v3o = (double *)output; + + for (i = 0; i < n_elems; i++) { + if ((mask & (1LL << i)) != 0) { + v3o[j] = v2i[i]; + j++; + } + } + + for (i = j; i < n_elems; i++) { + v3o[i] = 0; + } +} + +#define GEN_MASK_STORE_CHECK_CASE(intrin, prefix, suffix, mask, n_elem, \ + modeller, checker) \ + intrin((void *)&res.suffix, mask, prefix##src1.suffix); \ + modeller((void *)&prefix##src2.suffix, mask, (void *)&prefix##src1.suffix, \ + (void *)&mres.suffix, n_elem); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +void NOINLINE do_m512_compress() { + volatile V512 res; + + soft_isrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_compress_epi32, i, zmmi, k16, 16, + model_mask_compress_i32, check_equal_nd); + + soft_isrc1_update(); + 
GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_epi32, i, zmmi, k16, 16, + model_mask_compress_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_epi32, i, zmmi, k16, 16, + model_maskz_compress_i32, check_equal_nd); + + soft_isrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_compress_epi64, i, zmmi, k8, 8, + model_mask_compress_i64, check_equal_nq); + + soft_isrc1_update(); + GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_epi64, i, zmmi, k8, 8, + model_mask_compress_i64, check_equal_nq); + + soft_isrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_epi64, i, zmmi, k8, 8, + model_maskz_compress_i64, check_equal_nq); + + soft_fsrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_compress_ps, f, zmm, k16, 16, + model_mask_compress_f32, check_equal_nsf); + + soft_fsrc1_update(); + GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_ps, f, zmm, k16, 16, + model_mask_compress_f32, check_equal_nsf); + + soft_fsrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_ps, f, zmm, k16, 16, + model_maskz_compress_f32, check_equal_nsf); + + soft_dsrc1_update(); + GEN_MASK_CHECK_CASE(_mm512_mask_compress_pd, d, zmmd, k8, 8, + model_mask_compress_f64, check_equal_ndf); + + soft_dsrc1_update(); + GEN_MASK_STORE_CHECK_CASE(_mm512_mask_compressstoreu_pd, d, zmmd, k8, 8, + model_mask_compress_f64, check_equal_ndf); + + soft_dsrc1_update(); + GEN_MASKZ_CHECK_CASE(_mm512_maskz_compress_pd, d, zmmd, k8, 8, + model_maskz_compress_f64, check_equal_ndf); +} + +int main() { + init(); + + do_m512_expand(); + do_m512_compress(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/expand_compress.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/expand_compress.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/expand_compress.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/extract.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/extract.c +++ SingleSource/UnitTests/Vector/AVX512F/extract.c @@ -0,0 +1,204 @@ +/* + * Tests for extract intrinsics family. + * Here we check for _mm512_[mask|maskz]_extract[f|i] intrinsics. + */ + +#include "m512_test_util.h" +#include +#include + +volatile __int64 vol0; + +V512 isrc1; +V512 isrc2; + +V512 fsrc1; +V512 fsrc2; + +V512 dsrc1; +V512 dsrc2; + +V512 res; +V512 mres; + +__mmask8 k8; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + isrc1.s32[i] = i; + isrc2.s32[i] = i + 1; + + fsrc1.f32[i] = i * 1.0f; + fsrc2.f32[i] = i * 2.0f; + } + + for (i = 0; i < 8; i++) { + dsrc1.f64[i] = i * 4.0; + dsrc2.f64[i] = i * 5.0; + } + + k8 = 0x5a; +} + +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. + */ +#define soft_isrc1_update() isrc1.xmmi[vol0] = isrc1.xmmi[vol0] +#define soft_fsrc1_update() fsrc1.xmmi[vol0] = fsrc1.xmmi[vol0] +#define soft_dsrc1_update() dsrc1.xmmi[vol0] = dsrc1.xmmi[vol0] + +/* + * Model extract intrinsic behavior. 
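+ * The selector picks which 128-bit (xmm) or 256-bit (ymm) lane of the source
+ * to extract; the mask then blends the extracted elements with the old
+ * destination (or zeroes them in the maskz forms).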
+ */ +V512 NOINLINE model_mask_extract_32x4(V512 input1, __mmask8 mask, V512 input2, + int selector) { + V512 tmp, lres; + int i; + + tmp.xmm[0] = input2.xmm[selector]; + + for (i = 0; i < 4; i++) { + if ((mask & (1LL << i)) != 0) { + lres.s32[i] = tmp.s32[i]; + } else { + lres.s32[i] = input1.s32[i]; + } + } + + return lres; +} + +V512 NOINLINE model_maskz_extract_32x4(__mmask8 mask, V512 input2, + int selector) { + V512 tmp, lres; + int i; + + tmp.xmm[0] = input2.xmm[selector]; + + for (i = 0; i < 4; i++) { + if ((mask & (1LL << i)) != 0) { + lres.s32[i] = tmp.s32[i]; + } else { + lres.s32[i] = 0; + } + } + + return lres; +} + +V512 NOINLINE model_mask_extract_64x4(V512 input1, __mmask8 mask, V512 input2, + int selector) { + V512 tmp, lres; + int i; + + tmp.ymm[0] = input2.ymm[selector]; + + for (i = 0; i < 4; i++) { + if ((mask & (1LL << i)) != 0) { + lres.s64[i] = tmp.s64[i]; + } else { + lres.s64[i] = input1.s64[i]; + } + } + + return lres; +} + +V512 NOINLINE model_maskz_extract_64x4(__mmask8 mask, V512 input2, + int selector) { + V512 tmp, lres; + int i; + + tmp.ymm[0] = input2.ymm[selector]; + + for (i = 0; i < 4; i++) { + if ((mask & (1LL << i)) != 0) { + lres.s64[i] = tmp.s64[i]; + } else { + lres.s64[i] = 0; + } + } + + return lres; +} + +#define GEN_CHECK_CASE_SEL(intrin, prefix, suffix, ress, n_elem, modeller, \ + checker, selector) \ + res.ress[0] = intrin(prefix##src1.suffix, selector); \ + mres = modeller(0xff, prefix##src1, selector); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_MASK_CHECK_CASE_SEL(intrin, prefix, suffix, ress, n_elem, \ + modeller, checker, selector) \ + res.ress[0] = \ + intrin(prefix##src2.ress[0], k8, prefix##src1.suffix, selector); \ + mres = modeller(prefix##src2, k8, prefix##src1, selector); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_MASKZ_CHECK_CASE_SEL(intrin, prefix, suffix, ress, n_elem, \ + modeller, checker, selector) \ + res.ress[0] = intrin(k8, prefix##src1.suffix, selector); \ + mres = modeller(k8, prefix##src1, selector); \ + checker((void *)&res.suffix, (void *)&mres.suffix, n_elem, #intrin, __LINE__) + +#define GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, \ + modeller_end, checker_end, selector) \ + GEN_CHECK_CASE_SEL(_mm512_##intrin_end, prefix, suffix, ress, n_elem, \ + model_maskz_##modeller_end, check_##checker_end, \ + selector); \ + GEN_MASK_CHECK_CASE_SEL(_mm512_mask_##intrin_end, prefix, suffix, ress, \ + n_elem, model_mask_##modeller_end, \ + check_##checker_end, selector); \ + GEN_MASKZ_CHECK_CASE_SEL(_mm512_maskz_##intrin_end, prefix, suffix, ress, \ + n_elem, model_maskz_##modeller_end, \ + check_##checker_end, selector) + +#define GEN_CHECK_CASE_3_2(intrin_end, prefix, suffix, ress, n_elem, \ + modeller_end, checker_end) \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ + checker_end, 0); \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ + checker_end, 1) + +#define GEN_CHECK_CASE_3_4(intrin_end, prefix, suffix, ress, n_elem, \ + modeller_end, checker_end) \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ + checker_end, 0); \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ + checker_end, 1); \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ + checker_end, 2); \ + GEN_CHECK_CASE_3_SEL(intrin_end, prefix, suffix, ress, n_elem, modeller_end, \ 
+ checker_end, 3) + +void NOINLINE do_m512_extract() { + soft_fsrc1_update(); + GEN_CHECK_CASE_3_4(extractf32x4_ps, f, zmm, xmm, 4, extract_32x4, equal_nsf); + soft_isrc1_update(); + GEN_CHECK_CASE_3_4(extracti32x4_epi32, i, zmmi, xmmi, 4, extract_32x4, + equal_nd); + + soft_fsrc1_update(); + GEN_CHECK_CASE_3_2(extractf64x4_pd, d, zmmd, ymmd, 2, extract_64x4, + equal_ndf); + soft_isrc1_update(); + GEN_CHECK_CASE_3_2(extracti64x4_epi64, i, zmmi, ymmi, 2, extract_64x4, + equal_nq); +} + +int main() { + init(); + + do_m512_extract(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/extract.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/extract.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/extract.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/getmant.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/getmant.c +++ SingleSource/UnitTests/Vector/AVX512F/getmant.c @@ -0,0 +1,430 @@ +/* + * Here we check for _mm512_[mask|maskz]_getmant_[round]_[pd|ps|ss|sd] + * intrinsics. + */ + +#include "m512_test_util.h" +#include +#include +#include + +volatile int vol0 = 0; + +V512 zeros; +V512 f64; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + zeros.s32[i] = 0; + } + + for (i = 0; i < 8; i++) { + f64.f64[i] = i; + } +} + +float getmant_ps(float src, int interval, int sc) { + union { + struct { + unsigned int fraction : 23; + unsigned int exp : 8; + unsigned int sign : 1; + } st; + float value; + } dst; + dst.value = src; + // Get sign bit + dst.st.sign = (sc == _MM_MANT_SIGN_zero) ? 0 : dst.st.sign; + + int isZero = (dst.st.exp == 0) && (dst.st.fraction == 0); + + int isDenorm = (dst.st.exp == 0) && (dst.st.fraction != 0); + + int isInfinite = (dst.st.exp == 0x0FF) && (dst.st.fraction == 0); + + int isNaN = (dst.st.exp == 0x0FF) && (dst.st.fraction != 0); + + // Check for NAN operand + + if (isNaN) { + return NAN; + } + + // Check for Zero and Infinite operands + if ((isZero) || (isInfinite)) { + dst.st.exp = 0x07F; + return dst.value; + } + + // Check for negative operand (including -0.0) + if ((dst.st.sign == 1) && sc == _MM_MANT_SIGN_nan) { + return NAN; + } + + // Checking for denormal operands + if (isDenorm) { + if (_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON) + dst.st.fraction = 0; + else { + int j = 0; + dst.st.exp = 0x07F; + + while (j == 0) { + // normalize mantissa + j = (dst.st.fraction >> 22) & 0x1; + // Start normalizing the mantissa + dst.st.fraction = (dst.st.fraction << 1); + + // Adjust the exponent + dst.st.exp--; + } + } + } + + // fraction is normalized. + + // Checking for exponent response + int isUnbiased = dst.st.exp - 0x07F; + + // subtract the bias from exponent + + int isOddExp = isUnbiased & 0x1; + + // recognized unbiased ODD exponent + + int SignalingBit = (dst.st.fraction >> 22) & 0x1; + + switch (interval) { + case _MM_MANT_NORM_1_2: + dst.st.exp = 0x07F; + break; + case _MM_MANT_NORM_p5_2: + dst.st.exp = (isOddExp) ? 0x07E : 0x07F; + break; + case _MM_MANT_NORM_p5_1: + dst.st.exp = 0x07E; + break; + case _MM_MANT_NORM_p75_1p5: + dst.st.exp = (SignalingBit) ? 
0x07E : 0x07F; + break; + } + return dst.value; +} + +double getmant_pd(double src, int interval, int sc) { + union { + struct { + unsigned long int fraction : 52; + unsigned int exp : 11; + unsigned int sign : 1; + } st; + double value; + } dst; + dst.value = src; + // Get sign bit + dst.st.sign = (sc == _MM_MANT_SIGN_zero) ? 0 : dst.st.sign; + + int isZero = (dst.st.exp == 0) && (dst.st.fraction == 0); + + int isDenorm = (dst.st.exp == 0) && (dst.st.fraction != 0); + + int isInfinite = (dst.st.exp == 0x7FF) && (dst.st.fraction == 0); + + int isNaN = (dst.st.exp == 0x7FF) && (dst.st.fraction != 0); + + // Check for NAN operand + + if (isNaN) { + return NAN; + } + + // Check for Zero and Infinite operands + if ((isZero) || (isInfinite)) { + dst.st.exp = 0x03FF; + return dst.value; + } + + // Check for negative operand (including -0.0) + if ((dst.st.sign == 1) && sc == _MM_MANT_SIGN_nan) { + return NAN; + } + + // Checking for denormal operands + if (isDenorm) { + if (_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON) + dst.st.fraction = 0; + else { + int j = 0; + dst.st.exp = 0x03FF; + + while (j == 0) { + // normalize mantissa + j = (dst.st.fraction >> 51) & 0x1; + // Start normalizing the mantissa + dst.st.fraction = (dst.st.fraction << 1); + + // Adjust the exponent + dst.st.exp--; + } + } + } + + // fraction is normalized. + + // Checking for exponent response + int isUnbiased = dst.st.exp - 0x03FF; + + // subtract the bias from exponent + + int isOddExp = isUnbiased & 0x1; + + // recognized unbiased ODD exponent + + int SignalingBit = (dst.st.fraction >> 51) & 0x1; + + switch (interval) { + case _MM_MANT_NORM_1_2: + dst.st.exp = 0x3FF; + break; + case _MM_MANT_NORM_p5_2: + dst.st.exp = (isOddExp) ? 0x3FE : 0x3FF; + break; + case _MM_MANT_NORM_p5_1: + dst.st.exp = 0x3FE; + break; + case _MM_MANT_NORM_p75_1p5: + dst.st.exp = (SignalingBit) ? 
0x3FE : 0x3FF; + break; + } + return dst.value; +} + +void emulate_getmant_ps(void *presult, const void *p, int size, int mask, + int zeromask, int interval, int sc) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + for (i = 0; i < size; i++) { + + if (((1 << i) & mask) == 0) { + if (zeromask) + result->f32[i] = 0; + else + result->f32[i] = result->f32[i]; + continue; + } + result->f32[i] = getmant_ps(v->f32[i], interval, sc); + } +} + +void emulate_getmant_pd(void *presult, const void *p, int size, int mask, + int zeromask, int interval, int sc) { + int i; + V512 *result = (V512 *)presult; + V512 *v = (V512 *)p; + for (i = 0; i < size; i++) { + + if (((1 << i) & mask) == 0) { + if (zeromask) + result->f64[i] = 0; + else + result->f64[i] = result->f64[i]; + continue; + } + result->f64[i] = getmant_pd(v->f64[i], interval, sc); + } +} + +void NOINLINE do_getmantpd() { + volatile __m512d zres, zmm; + volatile __m512d zexpected; + __mmask8 k = 0x75; + + zmm = f64.zmmd; + zres = _mm512_getmant_pd(zmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + emulate_getmant_pd(&zexpected, &zmm, 8, 0xffff, 0, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_getmant_pd", __LINE__); + + zres = _mm512_mask_getmant_pd(zres, k, zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_pd(&zexpected, &zmm, 8, k, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_mask_getmant_pd", __LINE__); + + zres = + _mm512_maskz_getmant_pd(k, zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero); + emulate_getmant_pd(&zexpected, &zmm, 8, k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_maksz_getmant_pd", __LINE__); + + zres = _mm512_getmant_round_pd(zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&zexpected, &zmm, 8, 0xffff, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_getmant_round_pd", __LINE__); + zres = _mm512_mask_getmant_round_pd(zres, k, zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&zexpected, &zmm, 8, k, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_mask_getmant_round_pd", + __LINE__); + zres = _mm512_maskz_getmant_round_pd( + k, zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&zexpected, &zmm, 8, k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&zres, &zexpected, 8, "_mm512_maskz_getmant_round_pd", + __LINE__); +} + +void NOINLINE do_getmantps() { + volatile __m512 zres, zmm; + volatile __m512 zexpected; + __mmask16 k = 0x75; + + zmm = f64.zmm; + zres = _mm512_getmant_ps(zmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + emulate_getmant_ps(&zexpected, &zmm, 16, 0xffff, 0, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_getmant_ps", __LINE__); + + zres = _mm512_mask_getmant_ps(zres, k, zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_ps(&zexpected, &zmm, 16, k, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_mask_getmant_ps", __LINE__); + + zres = + _mm512_maskz_getmant_ps(k, zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero); + emulate_getmant_ps(&zexpected, &zmm, 16, k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_maskz_getmant_ps", __LINE__); + + zres = 
_mm512_getmant_round_ps(zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&zexpected, &zmm, 16, 0xffff, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_getmant_round_ps", __LINE__); + zres = _mm512_mask_getmant_round_ps(zres, k, zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&zexpected, &zmm, 16, k, 0, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_mask_getmant_round_pd", + __LINE__); + + zres = _mm512_maskz_getmant_round_ps( + k, zmm, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&zexpected, &zmm, 16, k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&zres, &zexpected, 16, "_mm512_maskz_getmant_round_pd", + __LINE__); +} + +void NOINLINE do_getmantss() { + volatile __m128 xres, xmm, xmm2; + volatile __m128 xexpected; + __mmask8 k = 0x75; + xmm2 = zeros.xmm[0]; + xmm = f64.xmm[0]; + xres = _mm_getmant_ss(xmm2, xmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1, 1, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_getmant_ss", __LINE__); + + xres = _mm_mask_getmant_ss(xres, k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_mask_getmant_ss", __LINE__); + + xres = _mm_maskz_getmant_ss(k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_maskz_getmant_ss", __LINE__); + + xres = _mm_getmant_round_ss(xmm2, xmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1, 1, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_getmant_round_ss", __LINE__); + + xres = + _mm_mask_getmant_round_ss(xres, k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_mask_getmant_round_ss", __LINE__); + + xres = + _mm_maskz_getmant_round_ss(k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_ps(&xexpected, &xmm, 4, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_nsf(&xres, &xexpected, 4, "_mm_maskz_getmant_round_ss", __LINE__); +} + +void NOINLINE do_getmantsd() { + volatile __m128d xres, xmm, xmm2; + volatile __m128d xexpected; + __mmask8 k = 0x75; + xmm2 = zeros.xmmd[0]; + xmm = f64.xmmd[0]; + xres = _mm_getmant_sd(xmm2, xmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1, 1, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_ndf(&xres, &xexpected, 2, "_mm_getmant_sd", __LINE__); + + xres = _mm_mask_getmant_sd(xres, k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&xres, &xexpected, 2, "_mm_mask_getmant_sd", __LINE__); + + xres = _mm_maskz_getmant_sd(k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + 
check_equal_ndf(&xres, &xexpected, 2, "_mm_maskz_getmant_sd", __LINE__); + + xres = _mm_getmant_round_sd(xmm2, xmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero, + _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1, 1, _MM_MANT_NORM_p5_1, + _MM_MANT_SIGN_zero); + check_equal_ndf(&xres, &xexpected, 2, "_mm_getmant_round_sd", __LINE__); + + xres = + _mm_mask_getmant_round_sd(xres, k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&xres, &xexpected, 2, "_mm_mask_getmant_round_sd", __LINE__); + + xres = + _mm_maskz_getmant_round_sd(k, xmm2, xmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero, _MM_FROUND_CUR_DIRECTION); + emulate_getmant_pd(&xexpected, &xmm, 2, 0x1 & k, 1, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); + check_equal_ndf(&xres, &xexpected, 2, "_mm_maskz_getmant_round_sd", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_getmantpd(); + do_getmantps(); + do_getmantsd(); + do_getmantss(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/getmant.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/getmant.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/getmant.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/inline_asm.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/inline_asm.c +++ SingleSource/UnitTests/Vector/AVX512F/inline_asm.c @@ -0,0 +1,54 @@ +/* + * Exercise a very simple inline asm sequence. + */ + +#include "m512_test_util.h" +#include +#include + +__m512d a; +__m512d b; +__m512d c; + +void NOINLINE foo() { + void *pa, *pb; + + pa = &a; + pb = &b; + + __asm { + mov FULL_IREG(ax), [pa] + mov FULL_IREG(dx), [pb] + vmovapd zmm0, ZMMWORD PTR [FULL_IREG(ax)] + vaddpd zmm2, zmm0, ZMMWORD PTR [FULL_IREG(dx)] + vmovapd ZMMWORD PTR [FULL_IREG(ax)], zmm2 + } +} + +static void NOINLINE init() { + int i; + double *pa, *pb, *pc; + + pa = (double *)&a; + pb = (double *)&b; + pc = (double *)&c; + + for (i = 0; i < sizeof(a) / sizeof(double); i++) { + *pa++ = i * i; + *pb++ = (i + 8) * (i + 8); + *pc++ = i * i + (i + 8) * (i + 8); + } +} + +int main() { + init(); + foo(); + + if (memcmp(&a, &c, sizeof(a)) != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/inline_asm.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/inline_asm.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/inline_asm.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/insert.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/insert.c +++ SingleSource/UnitTests/Vector/AVX512F/insert.c @@ -0,0 +1,298 @@ +/* + * Test 512 and 256-bit insert intrinsics taking 2 simd operands + * and an immediate, with masked and zero-masked forms. + * Here we check for _mm512_[mask|maskz]_insert[f|i] intrinsics. 
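+ * An insert writes a 128-bit (xmm) or 256-bit (ymm) operand into the lane of
+ * the wider destination selected by the immediate, then applies the write mask.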
+ */ + +#include "m512_test_util.h" +#include + +typedef int bool; +#define true 1 +#define false 0 + +#define CHECK_INSERT(opcode, res_bit_size, dtype, is_masked, mask, \ + is_zero_mask, imm, op2_bit_size, elem_bit_size) \ + { \ + int fail = 0; \ + /* Compute the expected result. */ \ + expect.zmmi = compute_insert(&expect, mask, is_zero_mask, imm, \ + &dtype##op1.zmmi, &dtype##op2.zmmi, \ + res_bit_size, op2_bit_size, elem_bit_size); \ + \ + /* Compare the obtained and expected results. */ \ + fail = \ + check_equal_n##dtype(&res, &expect, res_bit_size / elem_bit_size, \ + is_masked ? (is_zero_mask ? opcode " zero mask" \ + : opcode " blend mask") \ + : opcode " no mask", \ + __LINE__); \ + if (fail) { \ + printf("\n\nMASK: %x, IMM: %d\n\n", mask, imm); \ + printf("\n"); \ + display_p##dtype(&dtype##op1, "op1:", res_bit_size / elem_bit_size); \ + printf("\n"); \ + display_p##dtype(&dtype##op2, "op2:", op2_bit_size / elem_bit_size); \ + printf("\n"); \ + display_p##dtype(&dtype##res_orig, \ + "old:", res_bit_size / elem_bit_size); \ + printf("\n================================================\n\n"); \ + } \ + } + +#define ZINSERT(opcode, res_bit_size, intrin, dtype, op2_xy, vtype_suffix, \ + is_masked, mask, is_zero_mask, imm, op2_bit_size, \ + elem_bit_size) \ + { \ + if (is_masked) { \ + if (is_zero_mask) { \ + /* Zero masking */ \ + memset(&res, 0xFF, sizeof(res)); \ + res.zmm##vtype_suffix = _mm512_maskz_##intrin( \ + mask, dtype##op1.zmm##vtype_suffix, \ + dtype##op2.op2_xy##mm##vtype_suffix[0], imm); \ + } else { \ + /* Blend masking */ \ + memcpy(&res, &dtype##res_orig, sizeof(res)); \ + res.zmm##vtype_suffix = _mm512_mask_##intrin( \ + res.zmm##vtype_suffix, mask, dtype##op1.zmm##vtype_suffix, \ + dtype##op2.op2_xy##mm##vtype_suffix[0], imm); \ + } \ + } else { \ + /* No masking */ \ + memset(&res, 0x0, sizeof(res)); \ + res.zmm##vtype_suffix = \ + _mm512_##intrin(dtype##op1.zmm##vtype_suffix, \ + dtype##op2.op2_xy##mm##vtype_suffix[0], imm); \ + } \ + CHECK_INSERT(opcode, res_bit_size, dtype, is_masked, mask, is_zero_mask, \ + imm, op2_bit_size, elem_bit_size) \ + } + +volatile int vol0 = 0; + +V512 dop1, dop2, dres_orig; +V512 qop1, qop2, qres_orig; +V512 res, expect; + +volatile unsigned int dres_orig_arr[16] = { + 0x12345678, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, + 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff}; + +volatile U64 qres_orig_arr[8] = {0x1234567890abcdef, 0x1111111111111111, + 0x2222222222222222, 0x3333333333333333, + 0x4444444444444444, 0x5555555555555555, + 0x6666666666666666, 0x7777777777777777}; + +static void NOINLINE init() { + int i; + + // Operand vectors + for (i = 0; i < 16; i++) { + dop1.u32[i] = 0x11000000 + i; + } + for (i = 0; i < 16; i++) { + dop2.u32[i] = 0xFF000000 + i; + } + + for (i = 0; i < 8; i++) { + qop1.u64[i] = 0x1111000000000000 + i; + } + for (i = 0; i < 8; i++) { + qop2.u64[i] = 0xFFFF000000000000 + i; + } + + // Destructed operand vectors + memcpy((void *)&dres_orig, (void *)dres_orig_arr, 64); + memcpy((void *)&qres_orig, (void *)qres_orig_arr, 64); +} + +// +// Emulate the insert operation. 
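+// compute_insert first copies op1 to the result and overwrites the lane selected
+// by imm with op2, then walks the elements applying blend or zero masking.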
+// +__m512i NOINLINE compute_insert(void *res, unsigned int mask, bool zero_mask, + int imm, const void *op1, const void *op2, + int res_bit_size, int op2_bit_size, + int elem_bit_size) { + V512 *vres = (V512 *)res; + V512 *vop1 = (V512 *)op1; + V512 *vop2 = (V512 *)op2; + + int res_idx, elem_mask; + int num_elems = res_bit_size / elem_bit_size; + + // Merge op1 and op2 into dest; we will do masking later. + // + if (res_bit_size == 512) { + // zmm dest. + memcpy((void *)&vres->zmmi, (void *)&vop1->zmmi, 64); + + if (op2_bit_size == 128) { + // xmm op2 + memcpy((void *)&vres->xmmi[imm], (void *)&vop2->xmmi[0], 16); + } else { + // ymm op2 + memcpy((void *)&vres->ymmi[imm], (void *)&vop2->ymmi[0], 32); + } + } else { + // ymm dest. + memcpy((void *)&vres->ymmi[0], (void *)&vop1->ymmi[0], 32); + + // xmm op2 + memcpy((void *)&vres->xmmi[imm], (void *)&vop2->xmmi[0], 16); + } + + // Apply masking. + // + res_idx = 0; + for (res_idx = 0; res_idx < num_elems; res_idx++) { + + elem_mask = mask & (1 << res_idx); + + // The merge above has taken care of the elem_mask == 1 case. + if (elem_mask == 0) { + if (zero_mask) { + // Zeroing behavior. + if (elem_bit_size == 32) { + vres->s32[res_idx] = 0; + } else { + vres->s64[res_idx] = 0; + } + } else { + // Blending behavior + if (elem_bit_size == 32) { + vres->s32[res_idx] = dres_orig.s32[res_idx]; + } else { + vres->s64[res_idx] = qres_orig.s64[res_idx]; + } + } + } + } + + return vres->zmmi; +} + +#define KMASK_NONE ((__mmask8)0xff) +#define K16MASK_NONE ((__mmask16)0xffff) + +#define K8MASK_32x16 ((__mmask16)0xaaaa) + +#define K8MASK_64x8 ((__mmask8)0xaa) + +// FLOAT operations +// ================ + +void NOINLINE do_zinsertf32x4() { + // zinsertf32x4 + // + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , false, K16MASK_NONE, false, + 0, 128, 32); + + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , true, K8MASK_32x16, false, + 0, 128, 32); + + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , true, K8MASK_32x16, true, 0, + 128, 32); + + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , true, K8MASK_32x16, true, 1, + 128, 32); + + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , true, K8MASK_32x16, true, 2, + 128, 32); + + /* + * No-op to inhibit PRE of loads to exercise mixing operations. + */ + dop2.s32[vol0] = dop2.s32[vol0]; + + ZINSERT("ZINSERTF32x4", 512, insertf32x4, d, x, , true, K8MASK_32x16, true, 3, + 128, 32); +} + +void NOINLINE do_zinsertf64x4() { + // zinsertf64x4 + // + ZINSERT("ZINSERTF64x4", 512, insertf64x4, q, y, d, false, KMASK_NONE, false, + 0, 256, 64); + + ZINSERT("ZINSERTF64x4", 512, insertf64x4, q, y, d, true, K8MASK_64x8, false, + 0, 256, 64); + + ZINSERT("ZINSERTF64x4", 512, insertf64x4, q, y, d, true, K8MASK_64x8, true, 0, + 256, 64); + + /* + * No-op to inhibit PRE of loads to exercise mixing operations. + */ + dop2.s64[vol0] = dop2.s64[vol0]; + + ZINSERT("ZINSERTF64x4", 512, insertf64x4, q, y, d, true, K8MASK_64x8, true, 1, + 256, 64); +} + +// INT operations +// ============== + +void NOINLINE do_zinserti32x4() { + // zinserti32x4 + // + ZINSERT("ZINSERTI32x4", 512, inserti32x4, d, x, i, false, K16MASK_NONE, false, + 0, 128, 32); + + ZINSERT("ZINSERTI32x4", 512, inserti32x4, d, x, i, true, K8MASK_32x16, false, + 1, 128, 32); + + ZINSERT("ZINSERTI32x4", 512, inserti32x4, d, x, i, true, K8MASK_32x16, true, + 2, 128, 32); + + /* + * No-op to inhibit PRE of loads to exercise mixing operations. 
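+   * (Reading and writing dop2 through the volatile index vol0 makes the
+   * compiler assume dop2 may change, so later loads are not hoisted or reused.)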
+ */ + dop2.s32[vol0] = dop2.s32[vol0]; + + ZINSERT("ZINSERTI32x4", 512, inserti32x4, d, x, i, true, K8MASK_32x16, true, + 3, 128, 32); +} + +void NOINLINE do_zinserti64x4() { + // zinserti64x4 + // + + ZINSERT("ZINSERTI64x4", 512, inserti64x4, q, y, i, false, KMASK_NONE, false, + 0, 256, 64); + + ZINSERT("ZINSERTI64x4", 512, inserti64x4, q, y, i, true, K8MASK_64x8, false, + 0, 256, 64); + + ZINSERT("ZINSERTI64x4", 512, inserti64x4, q, y, i, true, K8MASK_64x8, true, 0, + 256, 64); + + /* + * No-op to inhibit PRE of loads to exercise mixing operations. + */ + dop2.s64[vol0] = dop2.s64[vol0]; + + ZINSERT("ZINSERTI64x4", 512, inserti64x4, q, y, i, true, K8MASK_64x8, true, 1, + 256, 64); +} + +int main() { + init(); + + // FLOAT + do_zinsertf32x4(); + do_zinsertf64x4(); + + // INT + do_zinserti32x4(); + do_zinserti64x4(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/insert.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/insert.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/insert.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/m512_test_util.h =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/m512_test_util.h +++ SingleSource/UnitTests/Vector/AVX512F/m512_test_util.h @@ -1,258 +0,0 @@ -#ifndef M512_TEST_UTIL_H_INCLUDED -#define M512_TEST_UTIL_H_INCLUDED - -/* - * Common declarations useful for writing 512-bit unit tests. - */ - -#include -#include -#include -#include - -#define ALIGNTO(n) __declspec(align(n)) - -/* - * For purposes of unit tests it can be beneficial to suppress inlining - * simply so that only a single instance of a test function is emitted. - * Makes it easier to diff A/B assembly output. - */ -#define NOINLINE __declspec(noinline) - -/* - * FULL_IREG(ax) expands to either eax or rax depending on the target. - */ -#if defined(__x86_64) || defined(_M_X64) -#define FULL_IREG(reg) r##reg -#else -#define FULL_IREG(reg) e##reg -#endif - -/* Number of elements in an array. */ -#define ASIZE(a) (sizeof((a)) / sizeof((a)[0])) - -typedef __int64 I64; -typedef unsigned __int64 U64; - -typedef union ALIGNTO(64) { - - __m512 zmm; - __m512d zmmd; - __m512i zmmi; - - __m256 ymm[2]; - __m256d ymmd[2]; - __m256i ymmi[2]; - - __m128 xmm[4]; - __m128d xmmd[4]; - __m128i xmmi[4]; - - char c[64]; - signed char s8[64]; - unsigned char u8[64]; - short s16[32]; - unsigned short u16[32]; - int s32[16]; - unsigned int u32[16]; - float f32[16]; - I64 s64[8]; - U64 u64[8]; - double f64[8]; - -} V512; - -int n_errs = 0; - -/* - * Print the low N 32-bit unsigned integers from p. - */ - -void NOINLINE display_pd(const V512 *p, const char *banner, int n_elems) { - int i = 15; - - if (banner) { - printf("%s", banner); - } - - for (i = n_elems; i >= 0; i--) { - printf(" %0.8x", p->u32[i]); - if (i > 0 && i % 4 == 0) { - printf("\n"); - if (banner) { - printf("%*s", (int)strlen((void *)banner), ""); - } - } - } - printf("\n"); -} - -/* - * Print the low N 64-bit unsigned integers from p. 
- */ -void NOINLINE display_pq(const V512 *p, const char *banner, int n_elems) { - int i = 7; - - if (banner) { - printf("%s", banner); - } - - for (i = n_elems; i >= 0; i--) { - printf(" %0.16llx", p->u64[i]); - if (i > 0 && i % 4 == 0) { - printf("\n"); - if (banner) { - printf("%*s", (int)strlen((void *)banner), ""); - } - } - } - printf("\n"); -} - -/* - * Print the low N single precision floats from p. - */ - -void NOINLINE display_psf(const V512 *p, const char *banner, int n_elems) { - int i = 15; - - if (banner) { - printf("%s", banner); - } - - for (i = n_elems; i >= 0; i--) { - printf(" %7g", p->f32[i]); - if (i > 0 && i % 4 == 0) { - printf("\n"); - if (banner) { - printf("%*s", (int)strlen((void *)banner), ""); - } - } - } - printf("\n"); -} - -/* - * Print the low N double precision floats from p. - */ - -void NOINLINE display_pdf(const V512 *p, const char *banner, int n_elems) { - int i = 15; - - if (banner) { - printf("%s", banner); - } - - for (i = n_elems; i >= 0; i--) { - printf(" %7g", p->f64[i]); - if (i > 0 && i % 4 == 0) { - printf("\n"); - if (banner) { - printf("%*s", (int)strlen((void *)banner), ""); - } - } - } - printf("\n"); -} - -/* - * Check that the low N 32-bit elements of "got" and "expected" are the same. - */ -int NOINLINE check_equal_nd(void *got, void *expected, int n_elems, - char *banner, int line) { - int i, fail = 0; - V512 *v1 = (V512 *)got; - V512 *v2 = (V512 *)expected; - - for (i = 0; i < n_elems; i++) { - if (v1->u32[i] != v2->u32[i]) { - printf("ERROR(%d): %s failed at %d'th element: 0x%0.8x != 0x%0.8x\n", - line, banner ? banner : "", i, v1->u32[i], v2->u32[i]); - display_pd(got, "got:", n_elems); - display_pd(expected, "exp:", n_elems); - n_errs++; - fail = 1; - break; - } - } - return fail; -} - -/* - * Check that the low N 64-bit elements of "got" and "expected" are the same. - */ -int NOINLINE check_equal_nq(void *got, void *expected, int n_elems, - char *banner, int line) { - int i, fail = 0; - V512 *v1 = (V512 *)got; - V512 *v2 = (V512 *)expected; - - for (i = 0; i < n_elems; i++) { - if (v1->u64[i] != v2->u64[i]) { - printf( - "ERROR(%d): %s failed at %d'th element: 0x%0.16llx != 0x%0.16llx\n", - line, banner ? banner : "", i, v1->u64[i], v2->u64[i]); - display_pq(got, "got:", n_elems); - display_pq(expected, "exp:", n_elems); - n_errs++; - fail = 1; - break; - } - } - return fail; -} - -double delta = 1e-4; - -#define EQUAL_FP(v1, v2) \ - ((v1) < (v2) ? ((v2) - (v1) < delta) : ((v1) - (v2) < delta)) - -/* - * Check that the low N single precision float elements of "got" and "expected" - * are the same. - */ -int NOINLINE check_equal_nsf(void *got, void *expected, int n_elems, - char *banner, int line) { - int i, fail = 0; - V512 *v1 = (V512 *)got; - V512 *v2 = (V512 *)expected; - - for (i = 0; i < n_elems; i++) { - if (!EQUAL_FP(v1->f32[i], v2->f32[i])) { - printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, - banner ? banner : "", i, v1->f32[i], v2->f32[i]); - display_psf(got, "got:", n_elems); - display_psf(expected, "exp:", n_elems); - n_errs++; - fail = 1; - break; - } - } - return fail; -} - -/* - * Check that the low N double precision float elements of "got" and "expected" - * are the same. 
- */ -int NOINLINE check_equal_ndf(void *got, void *expected, int n_elems, - char *banner, int line) { - int i, fail = 0; - V512 *v1 = (V512 *)got; - V512 *v2 = (V512 *)expected; - - for (i = 0; i < n_elems; i++) { - if (!EQUAL_FP(v1->f64[i], v2->f64[i])) { - printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, - banner ? banner : "", i, v1->f64[i], v2->f64[i]); - display_pdf(got, "got:", n_elems); - display_pdf(expected, "exp:", n_elems); - n_errs++; - fail = 1; - break; - } - } - return fail; -} - -#endif /* M512_TEST_UTIL_H_INCLUDED */ Index: SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.c +++ SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.c @@ -0,0 +1,133 @@ +/* This test was created to check the correctness + * of the following intrinsics support: + * _mm_cvt_roundsd_si32 + * _mm_cvt_roundsd_si64 + * _mm_cvt_roundss_si32 + * _mm_cvt_roundss_si64 + * _mm_cvtt_roundsd_si32 + * _mm_cvtt_roundsd_si64 + * _mm_cvtt_roundss_si32 + * _mm_cvtt_roundss_si64 + */ +#include "m512_test_util.h" + +#define DO_SS2SI32(f, r, exp) \ + src = _mm_set_ss(f); \ + res = _mm_cvt_roundss_si32(src, r | _MM_FROUND_NO_EXC); \ + if (res != exp) { \ + printf("cvt_roundss_si32(%f) = %d (expected %d)\n", f, res, exp); \ + n_errs++; \ + } + +#define DO_SS2SI32T(f, r, exp) \ + src = _mm_set_ss(f); \ + res = _mm_cvtt_roundss_si32(src, r); \ + if (res != exp) { \ + printf("cvtt_roundss_si32(%f) = %d (expected %d)\n", f, res, exp); \ + n_errs++; \ + } + +#define DO_SS2SI64(f, r, exp) \ + src = _mm_set_ss(f); \ + lres = _mm_cvt_roundss_si64(src, r | _MM_FROUND_NO_EXC); \ + if (lres != exp) { \ + printf("cvt_roundss_si64(%f) = %lld (expected %lld)\n", f, lres, exp); \ + n_errs++; \ + } + +#define DO_SS2SI64T(f, r, exp) \ + src = _mm_set_ss(f); \ + lres = _mm_cvtt_roundss_si64(src, r); \ + if (lres != exp) { \ + printf("cvt_roundss_si64(%f) = %lld (expected %lld)\n", f, lres, exp); \ + n_errs++; \ + } + +#define DO_SD2SI32(f, r, exp) \ + dsrc = _mm_set_sd(f); \ + res = _mm_cvt_roundsd_si32(dsrc, r | _MM_FROUND_NO_EXC); \ + if (res != exp) { \ + printf("cvt_roundsd_si32(%f) = %d (expected %d)\n", f, res, exp); \ + n_errs++; \ + } + +#define DO_SD2SI32T(f, r, exp) \ + dsrc = _mm_set_sd(f); \ + res = _mm_cvtt_roundsd_si32(dsrc, r); \ + if (res != exp) { \ + printf("cvtt_roundsd_si32(%f) = %d (expected %d)\n", f, res, exp); \ + n_errs++; \ + } + +#define DO_SD2SI64(f, r, exp) \ + dsrc = _mm_set_sd(f); \ + lres = _mm_cvt_roundsd_si64(dsrc, r | _MM_FROUND_NO_EXC); \ + if (lres != exp) { \ + printf("cvt_roundsd_si64(%f) = %lld (expected %lld)\n", f, lres, exp); \ + n_errs++; \ + } + +#define DO_SD2SI64T(f, r, exp) \ + dsrc = _mm_set_sd(f); \ + lres = _mm_cvtt_roundsd_si64(dsrc, r); \ + if (lres != exp) { \ + printf("cvt_roundsd_si64(%f) = %lld (expected %lld)\n", f, lres, exp); \ + n_errs++; \ + } + +void NOINLINE do_cvt_roundss() { + __m128 src; + int res; + __int64 lres; + + DO_SS2SI32(1.6f, _MM_FROUND_NO_EXC, 2); + DO_SS2SI32(-1.6f, _MM_FROUND_TO_ZERO, -1); + DO_SS2SI32(-1.1f, _MM_FROUND_TO_NEG_INF, -2); + DO_SS2SI32(-1.1f, _MM_FROUND_TO_POS_INF, -1); + DO_SS2SI32(10.8f, _MM_FROUND_TO_ZERO, 10); + DO_SS2SI32T(1.6f, _MM_FROUND_NO_EXC, 1); + +#if defined(__x86_64) || defined(_M_X64) + DO_SS2SI64(1.6f, _MM_FROUND_NO_EXC, 2ll); + DO_SS2SI64(-1.6f, _MM_FROUND_TO_ZERO, -1ll); + DO_SS2SI64(-1.1f, _MM_FROUND_TO_NEG_INF, -2ll); + DO_SS2SI64(-1.1f, _MM_FROUND_TO_POS_INF, -1ll); + 
DO_SS2SI64(10.8f, _MM_FROUND_TO_ZERO, 10ll); + DO_SS2SI64T(1.6f, _MM_FROUND_NO_EXC, 1ll); +#endif +} + +void NOINLINE do_cvt_roundsd() { + __m128d dsrc; + int res; + __int64 lres; + + DO_SD2SI32(1.6, _MM_FROUND_NO_EXC, 2); + DO_SD2SI32(-1.6, _MM_FROUND_TO_ZERO, -1); + DO_SD2SI32(-1.1, _MM_FROUND_TO_NEG_INF, -2); + DO_SD2SI32(-1.1, _MM_FROUND_TO_POS_INF, -1); + DO_SD2SI32(10.8, _MM_FROUND_TO_ZERO, 10); + DO_SD2SI32T(1.6, _MM_FROUND_NO_EXC, 1); + +#if defined(__x86_64) || defined(_M_X64) + DO_SD2SI64(1.6, _MM_FROUND_NO_EXC, 2ll); + DO_SD2SI64(-1.6, _MM_FROUND_TO_ZERO, -1ll); + DO_SD2SI64(-1.1, _MM_FROUND_TO_NEG_INF, -2ll); + DO_SD2SI64(-1.1, _MM_FROUND_TO_POS_INF, -1ll); + DO_SD2SI64(10.8, _MM_FROUND_TO_ZERO, 10ll); + DO_SD2SI64T(1.6, _MM_FROUND_NO_EXC, 1ll); +#endif +} + +int main() { + do_cvt_roundss(); + do_cvt_roundsd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/mm_cvt_rounds.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/shift.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/shift.c +++ SingleSource/UnitTests/Vector/AVX512F/shift.c @@ -0,0 +1,153 @@ +/* + * Test shift instructions. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_sllv_epi32() + * _mm512_srav_epi32() + * _mm512_srlv_epi32() + */ + +#include "m512_test_util.h" +#include + +__m512i i1; +__m512i i2; +__m512i i3; +__m512i i4; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE init() { + /* Set shift counts in i1. */ + + int i; + V512 *v = (V512 *)&i1; + + for (i = 0; i < 16; i++) { + if ((i & 0x3) == 0) { + v->u32[i] = i + vol; + } else if ((i & 0x3) == 1) { + v->u32[i] = -(i + 1 + vol); + } else if ((i & 0x3) == 2) { + v->u32[i] = i + 31 + vol; + } else { + v->u32[i] = -(i + 31 + vol); + } + } + + /* Set random values in i2. */ + + v = (V512 *)&i2; + + for (i = 0; i < 64; i++) { + v->u8[i] = i * i - 3 * i + i + vol; + } +} + +void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 16; i++) { + int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i]; + if (got->s32[i] != ans) { + printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n", + banner ? banner : "", got->s32[i], ans, i); + n_errs++; + break; + } + } +} + +typedef enum { + SLLV, /* Shift count can exceed 31. */ + SRAV, /* Shift count can exceed 31. */ + SRLV, /* Shift count can exceed 31. */ + SLLV31, /* Shift count is anded with 31. */ + SRAV31, /* Shift count is anded with 31. */ + SRLV31 /* Shift count is anded with 31. 
*/ +} SHIFT_TYPE; + +void NOINLINE emulate_shift(void *presult, void *pv_old, int mask, void *psrc, + void *pcounts, SHIFT_TYPE type) { + V512 *result = (V512 *)presult; + V512 *v_old = (V512 *)pv_old; + V512 *src = (V512 *)psrc; + V512 *counts = (V512 *)pcounts; + int i, imm; + + for (i = 0; i < 16; i++) { + if ((mask & (1 << i)) == 0) { + result->u32[i] = v_old->u32[i]; + } else { + imm = counts->u32[i]; + if (type == SLLV31 || type == SRAV31 || type == SRLV31) { + imm &= 0x1f; + } + + if ((unsigned int)imm > 31) { + if (type == SRAV && src->s32[i] < 0) { + result->u32[i] = -1; + } else { + result->u32[i] = 0; + } + } else if (type == SLLV || type == SLLV31) { + result->u32[i] = src->u32[i] << imm; + } else if (type == SRLV || type == SRLV31) { + result->u32[i] = src->u32[i] >> imm; + } else { /* (type == SRAV || type == SRAV31) */ + result->u32[i] = src->s32[i] >> imm; + } + } + } +} + +void NOINLINE do_shifts() { + int k; + + k = 0xffff; + i3 = _mm512_sllv_epi32(i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SLLV); + check_equal32(&i3, &i4, &i2, k, "_mm512_sllv_epi32"); + + k = 0x97d5; + i3 = _mm512_mask_sllv_epi32(i2, k, i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SLLV); + check_equal32(&i3, &i4, &i2, k, "_mm512_sllv_epi32"); + + k = 0xffff; + i3 = _mm512_srav_epi32(i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SRAV); + check_equal32(&i3, &i4, &i2, k, "_mm512_srav_epi32"); + + k = 0x97d5; + i3 = _mm512_mask_srav_epi32(i2, k, i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SRAV); + check_equal32(&i3, &i4, &i2, k, "_mm512_srav_epi32"); + + k = 0xffff; + i3 = _mm512_srlv_epi32(i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SRLV); + check_equal32(&i3, &i4, &i2, k, "_mm512_srlv_epi32"); + + k = 0x97d5; + i3 = _mm512_mask_srlv_epi32(i2, k, i2, i1); + emulate_shift(&i4, &i2, k, &i2, &i1, SRLV); + check_equal32(&i3, &i4, &i2, k, "_mm512_srlv_epi32"); +} + +int main() { + init(); + do_shifts(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/shift.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/shift.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/shift.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/shiftrot.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/shiftrot.c +++ SingleSource/UnitTests/Vector/AVX512F/shiftrot.c @@ -0,0 +1,822 @@ +/* + * Test shifts and rotates. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_[maskz_]sllv_epi*() + * _mm512_[maskz_]srav_epi*() + * _mm512_[maskz_]srlv_epi*() + * _mm512_[maskz_]slli_epi*() + * _mm512_[maskz_]srai_epi*() + * _mm512_[maskz_]srli_epi*() + * _mm512_[maskz_]rol_epi*() + * _mm512_[maskz_]ror_epi*() + * _mm512_[maskz_]rolv_epi*() + * _mm512_[maskz_]rorv_epi*() + */ + +#include "m512_test_util.h" +#include +#include + +V512 counts16, counts32, counts64, src, passthru, counts32_imm, counts64_imm; +__mmask8 k8 = 0xf9; +__mmask16 k16 = 0x9ffe; + +volatile int vol0; + +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. 
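+ * (PRE = partial redundancy elimination; re-assigning through the volatile
+ * index vol0 forces the compiler to reload the vector after each test.)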
+ */ +#define soft_update(v512) (v512).xmmi[vol0] = (v512).xmmi[vol0] +#define soft_src_update() soft_update(src) +#define soft_counts16_update() soft_update(counts16) +#define soft_counts32_update() soft_update(counts32) +#define soft_counts64_update() soft_update(counts64) + +#define IMMCNT 3 + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + counts32.s32[i] = ((i & 0x3) == 0) ? (-3 * i) : (3 * i); + counts32_imm.s32[i] = IMMCNT; + src.s32[i] = -27 * i * i + 300 * i - 82; + if (i & 0x1) { + src.s32[i] *= -1; + } + passthru.s32[i] = 48 * i * i + 100 * i - 9; + } + + for (i = 0; i < 8; i++) { + counts64.s64[i] = ((i & 0x3) == 0) ? (-9 * i) : (9 * i); + counts64_imm.s64[i] = IMMCNT; + } + + for (i = 0; i < 32; i++) { + counts16.s16[i] = ((i & 0x3) == 0) ? (3 - i) : i; + } +} + +typedef enum { OP_ROL, OP_ROR, OP_SLL, OP_SRA, OP_SRL } Operation; + +V512 dummy; + +#define emulate16_m128(oper, res, src1, src2) \ + emulate16((oper), (res), dummy, (__mmask32)-1, (src1), (src2), 8, 0) + +#define emulate16_mask_m128(oper, res, orig, k, src1, src2) \ + emulate16((oper), (res), (orig), (k), (src1), (src2), 8, 0) + +#define emulate16_maskz_m128(oper, res, k, src1, src2) \ + emulate16((oper), (res), dummy, (k), (src1), (src2), 8, 1) + +#define emulate16_m256(oper, res, src1, src2) \ + emulate16((oper), (res), dummy, (__mmask32)-1, (src1), (src2), 16, 0) + +#define emulate16_mask_m256(oper, res, orig, k, src1, src2) \ + emulate16((oper), (res), (orig), (k), (src1), (src2), 16, 0) + +#define emulate16_maskz_m256(oper, res, k, src1, src2) \ + emulate16((oper), (res), dummy, (k), (src1), (src2), 16, 1) + +#define emulate16_m512(oper, res, src1, src2) \ + emulate16((oper), (res), dummy, (__mmask32)-1, (src1), (src2), 32, 0) + +#define emulate16_mask_m512(oper, res, orig, k, src1, src2) \ + emulate16((oper), (res), (orig), (k), (src1), (src2), 32, 0) + +#define emulate16_maskz_m512(oper, res, k, src1, src2) \ + emulate16((oper), (res), dummy, (k), (src1), (src2), 32, 1) + +void NOINLINE emulate16(Operation oper, V512 *res, V512 orig, __mmask16 k, + V512 src1, V512 src2, int length, int zero_mask) { + int i; + short op1, op2, r; + + for (i = 0; i < length; i++) { + if (((1 << i) & k) == 0) { + r = zero_mask ? 0 : orig.s16[i]; + } else { + op1 = src1.s16[i]; + op2 = src2.s16[i]; + + switch (oper) { + case OP_ROL: + op2 &= 0xf; + r = (op1 << op2); + if (op2 != 0) { + r |= ((unsigned short)op1 >> (16 - op2)); + } + break; + case OP_ROR: + op2 &= 0xf; + r = ((unsigned short)op1 >> op2); + if (op2 != 0) { + r |= (op1 << (16 - op2)); + } + break; + case OP_SLL: + r = (op2 & ~0xf) ? 0 : (op1 << op2); + break; + case OP_SRA: + r = (op2 & ~0xf) ? (op1 < 0 ? -1 : 0) : (op1 >> op2); + break; + case OP_SRL: + r = (op2 & ~0xf) ? 
0 : ((unsigned short)op1 >> op2); + break; + default: + printf("ERROR -- unexpected 16-bit operation\n"); + n_errs++; + return; + } + } + + res->s16[i] = r; + } +} + +#define emulate32_m128(oper, res, src1, src2) \ + emulate32((oper), (res), dummy, (__mmask16)-1, (src1), (src2), 4, 0) + +#define emulate32_mask_m128(oper, res, orig, k, src1, src2) \ + emulate32((oper), (res), (orig), (k), (src1), (src2), 4, 0) + +#define emulate32_maskz_m128(oper, res, k, src1, src2) \ + emulate32((oper), (res), dummy, (k), (src1), (src2), 4, 1) + +#define emulate32_m256(oper, res, src1, src2) \ + emulate32((oper), (res), dummy, (__mmask16)-1, (src1), (src2), 8, 0) + +#define emulate32_mask_m256(oper, res, orig, k, src1, src2) \ + emulate32((oper), (res), (orig), (k), (src1), (src2), 8, 0) + +#define emulate32_maskz_m256(oper, res, k, src1, src2) \ + emulate32((oper), (res), dummy, (k), (src1), (src2), 8, 1) + +#define emulate32_m512(oper, res, src1, src2) \ + emulate32((oper), (res), dummy, (__mmask16)-1, (src1), (src2), 16, 0) + +#define emulate32_mask_m512(oper, res, orig, k, src1, src2) \ + emulate32((oper), (res), (orig), (k), (src1), (src2), 16, 0) + +#define emulate32_maskz_m512(oper, res, k, src1, src2) \ + emulate32((oper), (res), dummy, (k), (src1), (src2), 16, 1) + +#define emulate32_m256(oper, res, src1, src2) \ + emulate32((oper), (res), dummy, (__mmask16)-1, (src1), (src2), 8, 0) + +#define emulate32_mask_m256(oper, res, orig, k, src1, src2) \ + emulate32((oper), (res), (orig), (k), (src1), (src2), 8, 0) + +#define emulate32_maskz_m256(oper, res, k, src1, src2) \ + emulate32((oper), (res), dummy, (k), (src1), (src2), 8, 1) + +#define emulate32_m128(oper, res, src1, src2) \ + emulate32((oper), (res), dummy, (__mmask16)-1, (src1), (src2), 4, 0) + +#define emulate32_mask_m128(oper, res, orig, k, src1, src2) \ + emulate32((oper), (res), (orig), (k), (src1), (src2), 4, 0) + +#define emulate32_maskz_m128(oper, res, k, src1, src2) \ + emulate32((oper), (res), dummy, (k), (src1), (src2), 4, 1) + +void NOINLINE emulate32(Operation oper, V512 *res, V512 orig, __mmask16 k, + V512 src1, V512 src2, int length, int zero_mask) { + int i, op1, op2, r; + + for (i = 0; i < length; i++) { + if (((1 << i) & k) == 0) { + r = zero_mask ? 0 : orig.s32[i]; + } else { + op1 = src1.s32[i]; + op2 = src2.s32[i]; + + switch (oper) { + case OP_ROL: + op2 &= 0x1f; + r = (op1 << op2); + if (op2 != 0) { + r |= ((unsigned int)op1 >> (32 - op2)); + } + break; + case OP_ROR: + op2 &= 0x1f; + r = ((unsigned int)op1 >> op2); + if (op2 != 0) { + r |= (op1 << (32 - op2)); + } + break; + case OP_SLL: + r = (op2 & ~0x1f) ? 0 : (op1 << op2); + break; + case OP_SRA: + r = (op2 & ~0x1f) ? (op1 < 0 ? -1 : 0) : (op1 >> op2); + break; + case OP_SRL: + r = (op2 & ~0x1f) ? 
0 : ((unsigned int)op1 >> op2); + break; + default: + printf("ERROR -- unexpected 32-bit operation\n"); + n_errs++; + return; + } + } + + res->s32[i] = r; + } +} + +#define emulate64_m128(oper, res, src1, src2) \ + emulate64((oper), (res), dummy, (__mmask8)-1, (src1), (src2), 2, 0) + +#define emulate64_mask_m128(oper, res, orig, k, src1, src2) \ + emulate64((oper), (res), (orig), (k), (src1), (src2), 2, 0) + +#define emulate64_maskz_m128(oper, res, k, src1, src2) \ + emulate64((oper), (res), dummy, (k), (src1), (src2), 2, 1) + +#define emulate64_m256(oper, res, src1, src2) \ + emulate64((oper), (res), dummy, (__mmask8)-1, (src1), (src2), 4, 0) + +#define emulate64_mask_m256(oper, res, orig, k, src1, src2) \ + emulate64((oper), (res), (orig), (k), (src1), (src2), 4, 0) + +#define emulate64_maskz_m256(oper, res, k, src1, src2) \ + emulate64((oper), (res), dummy, (k), (src1), (src2), 4, 1) + +#define emulate64_m512(oper, res, src1, src2) \ + emulate64((oper), (res), dummy, (__mmask8)-1, (src1), (src2), 8, 0) + +#define emulate64_mask_m512(oper, res, orig, k, src1, src2) \ + emulate64((oper), (res), (orig), (k), (src1), (src2), 8, 0) + +#define emulate64_maskz_m512(oper, res, k, src1, src2) \ + emulate64((oper), (res), dummy, (k), (src1), (src2), 8, 1) + +#define emulate64_m256(oper, res, src1, src2) \ + emulate64((oper), (res), dummy, (__mmask8)-1, (src1), (src2), 4, 0) + +#define emulate64_mask_m256(oper, res, orig, k, src1, src2) \ + emulate64((oper), (res), (orig), (k), (src1), (src2), 4, 0) + +#define emulate64_maskz_m256(oper, res, k, src1, src2) \ + emulate64((oper), (res), dummy, (k), (src1), (src2), 4, 1) + +#define emulate64_m128(oper, res, src1, src2) \ + emulate64((oper), (res), dummy, (__mmask8)-1, (src1), (src2), 2, 0) + +#define emulate64_mask_m128(oper, res, orig, k, src1, src2) \ + emulate64((oper), (res), (orig), (k), (src1), (src2), 2, 0) + +#define emulate64_maskz_m128(oper, res, k, src1, src2) \ + emulate64((oper), (res), dummy, (k), (src1), (src2), 2, 1) + +void NOINLINE emulate64(Operation oper, V512 *res, V512 orig, __mmask8 k, + V512 src1, V512 src2, int length, int zero_mask) { + int i; + I64 op1, op2, r; + + for (i = 0; i < length; i++) { + if (((1 << i) & k) == 0) { + r = zero_mask ? 0 : orig.s64[i]; + } else { + op1 = src1.s64[i]; + op2 = src2.s64[i]; + + switch (oper) { + case OP_ROL: + op2 &= 0x3f; + r = (op1 << op2); + if (op2 != 0) { + r |= ((U64)op1 >> (64 - op2)); + } + break; + case OP_ROR: + op2 &= 0x3f; + r = ((U64)op1 >> op2); + if (op2 != 0) { + r |= (op1 << (64 - op2)); + } + break; + case OP_SLL: + r = (op2 & ~0x3f) ? 0 : (op1 << op2); + break; + case OP_SRA: + r = (op2 & ~0x3f) ? (op1 < 0 ? -1 : 0) : (op1 >> op2); + break; + case OP_SRL: + r = (op2 & ~0x3f) ? 
0 : ((U64)op1 >> op2); + break; + default: + printf("ERROR -- unexpected 64-bit operation\n"); + n_errs++; + return; + } + } + + res->s64[i] = r; + } +} + +void NOINLINE do_shiftv32() { + V512 res; + V512 expected; + + /* sll 32 */ + + soft_counts32_update(); + res.zmmi = _mm512_sllv_epi32(src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_m512(OP_SLL, &expected, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_sllv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = + _mm512_mask_sllv_epi32(passthru.zmmi, k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_mask_m512(OP_SLL, &expected, passthru, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_mask_sllv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_sllv_epi32(k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_maskz_m512(OP_SLL, &expected, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sllv_epi32", __LINE__); + + /* sra 32 */ + + soft_counts32_update(); + res.zmmi = _mm512_srav_epi32(src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_m512(OP_SRA, &expected, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_srav_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = + _mm512_mask_srav_epi32(passthru.zmmi, k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_mask_m512(OP_SRA, &expected, passthru, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_mask_srav_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_srav_epi32(k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_maskz_m512(OP_SRA, &expected, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srav_epi32", __LINE__); + + /* srl 32 */ + + soft_counts32_update(); + res.zmmi = _mm512_srlv_epi32(src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_m512(OP_SRL, &expected, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_srlv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = + _mm512_mask_srlv_epi32(passthru.zmmi, k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_mask_m512(OP_SRL, &expected, passthru, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_mask_srlv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_srlv_epi32(k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_maskz_m512(OP_SRL, &expected, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srlv_epi32", __LINE__); +} + +void NOINLINE do_shiftv64() { + V512 res; + V512 expected; + + /* sll 64 */ + + soft_counts64_update(); + res.zmmi = _mm512_sllv_epi64(src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_m512(OP_SLL, &expected, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_sllv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_sllv_epi64(passthru.zmmi, k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_mask_m512(OP_SLL, &expected, passthru, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_mask_sllv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_sllv_epi64(k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_maskz_m512(OP_SLL, &expected, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sllv_epi64", __LINE__); + + /* sra 64 */ + + soft_counts64_update(); + res.zmmi = _mm512_srav_epi64(src.zmmi, 
counts64.zmmi); + soft_counts64_update(); + emulate64_m512(OP_SRA, &expected, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_srav_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_srav_epi64(passthru.zmmi, k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_mask_m512(OP_SRA, &expected, passthru, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_mask_srav_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_srav_epi64(k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_maskz_m512(OP_SRA, &expected, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srav_epi64", __LINE__); + + /* srl 64 */ + + soft_counts64_update(); + res.zmmi = _mm512_srlv_epi64(src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_m512(OP_SRL, &expected, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_srlv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_srlv_epi64(passthru.zmmi, k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_mask_m512(OP_SRL, &expected, passthru, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_mask_srlv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_srlv_epi64(k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_maskz_m512(OP_SRL, &expected, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srlv_epi64", __LINE__); +} + +void NOINLINE do_rotate32() { + V512 res; + V512 expected; + + /* rotate left 32 */ + + soft_counts32_update(); + res.zmmi = _mm512_rolv_epi32(src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_m512(OP_ROL, &expected, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_rolv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = + _mm512_mask_rolv_epi32(passthru.zmmi, k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_mask_m512(OP_ROL, &expected, passthru, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rolv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_rolv_epi32(k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_maskz_m512(OP_ROL, &expected, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rolv_epi32", __LINE__); + + /* rotate left 32 imm form 512 */ + + soft_counts32_update(); + res.zmmi = _mm512_rol_epi32(src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_m512(OP_ROL, &expected, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_rol_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_mask_rol_epi32(passthru.zmmi, k16, src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_mask_m512(OP_ROL, &expected, passthru, k16, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rol_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_rol_epi32(k16, src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_maskz_m512(OP_ROL, &expected, k16, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rol_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_rorv_epi32(src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_m512(OP_ROR, &expected, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_rorv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = + _mm512_mask_rorv_epi32(passthru.zmmi, k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + 
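+  /* Reference result for the masked rotate-right: lanes with a clear bit in k16 keep the passthru value. */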
emulate32_mask_m512(OP_ROR, &expected, passthru, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rorv_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_rorv_epi32(k16, src.zmmi, counts32.zmmi); + soft_counts32_update(); + emulate32_maskz_m512(OP_ROR, &expected, k16, src, counts32); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rorv_epi32", __LINE__); + + /* rotate right 32 imm form */ + soft_counts32_update(); + res.zmmi = _mm512_ror_epi32(src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_m512(OP_ROR, &expected, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_ror_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_mask_ror_epi32(passthru.zmmi, k16, src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_mask_m512(OP_ROR, &expected, passthru, k16, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_ror_epi32", __LINE__); + + soft_counts32_update(); + res.zmmi = _mm512_maskz_ror_epi32(k16, src.zmmi, IMMCNT); + soft_counts32_update(); + emulate32_maskz_m512(OP_ROR, &expected, k16, src, counts32_imm); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ror_epi32", __LINE__); +} + +void NOINLINE do_rotate64() { + V512 res; + V512 expected; + + /* rotate left 64 */ + + soft_counts64_update(); + res.zmmi = _mm512_rolv_epi64(src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_m512(OP_ROL, &expected, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_rolv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_rolv_epi64(passthru.zmmi, k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_mask_m512(OP_ROL, &expected, passthru, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rolv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_rolv_epi64(k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_maskz_m512(OP_ROL, &expected, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rolv_epi64", __LINE__); + + /* rotate left 64 imm form 512 */ + + soft_counts64_update(); + res.zmmi = _mm512_rol_epi64(src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_m512(OP_ROL, &expected, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_rol_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_rol_epi64(passthru.zmmi, k8, src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_mask_m512(OP_ROL, &expected, passthru, k8, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rol_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_rol_epi64(k8, src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_maskz_m512(OP_ROL, &expected, k8, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rol_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_rorv_epi64(src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_m512(OP_ROR, &expected, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_rorv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_rorv_epi64(passthru.zmmi, k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_mask_m512(OP_ROR, &expected, passthru, k8, src, counts64); + check_equal_nd(&res, &expected, 16, "_mm512_mask_rorv_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_rorv_epi64(k8, src.zmmi, counts64.zmmi); + soft_counts64_update(); + emulate64_maskz_m512(OP_ROR, &expected, k8, src, 
counts64); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_rorv_epi64", __LINE__); + + /* rotate right 64 imm form */ + soft_counts64_update(); + res.zmmi = _mm512_ror_epi64(src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_m512(OP_ROR, &expected, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_ror_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_mask_ror_epi64(passthru.zmmi, k8, src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_mask_m512(OP_ROR, &expected, passthru, k8, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_ror_epi64", __LINE__); + + soft_counts64_update(); + res.zmmi = _mm512_maskz_ror_epi64(k8, src.zmmi, IMMCNT); + soft_counts64_update(); + emulate64_maskz_m512(OP_ROR, &expected, k8, src, counts64_imm); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ror_epi64", __LINE__); +} + +void NOINLINE do_shifti32() { + V512 xres, zres; + V512 expected; + __mmask16 myk16 = 0x7f7f; + + /* slli, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_slli_epi32(src.zmmi, IMMCNT); + emulate32_m512(OP_SLL, &expected, src, counts32_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_slli_epi32", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_slli_epi32(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_slli_epi32 0", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_slli_epi32(src.zmmi, 33); + xres.zmmi = _mm512_setzero_epi32(); + check_equal_nd(&zres, &xres, 16, "_mm512_slli_epi32 33", __LINE__); + + /* slli, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_slli_epi32(passthru.zmmi, myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi32(passthru.zmmi, myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_slli_epi32", __LINE__); + + /* slli, zero-masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_maskz_slli_epi32(myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi32(myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_slli_epi32", __LINE__); + + /* srai, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_srai_epi32(src.zmmi, IMMCNT); + emulate32_m512(OP_SRA, &expected, src, counts32_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_srai_epi32", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srai_epi32(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_srai_epi32 0", __LINE__); + + /* srai, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_srai_epi32(passthru.zmmi, myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi32(passthru.zmmi, myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_srai_epi32", __LINE__); + + /* srai, zero-masked. Reuses "expected" from above. 
*/ + + soft_update(src); + zres.zmmi = _mm512_maskz_srai_epi32(myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi32(myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_srai_epi32", __LINE__); + + /* srli, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_srli_epi32(src.zmmi, IMMCNT); + emulate32_m512(OP_SRL, &expected, src, counts32_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_srli_epi32", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srli_epi32(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_srli_epi32 0", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srli_epi32(src.zmmi, 33); + xres.zmmi = _mm512_setzero_epi32(); + check_equal_nd(&zres, &xres, 16, "_mm512_srli_epi32 33", __LINE__); + + /* srli, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_srli_epi32(passthru.zmmi, myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi32(passthru.zmmi, myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_srli_epi32", __LINE__); + + /* srli, zero-masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_maskz_srli_epi32(myk16, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi32(myk16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_srli_epi32", __LINE__); +} + +void NOINLINE do_shifti64() { + V512 xres, zres; + V512 expected; + __mmask8 myk8 = 0xee; + + /* slli, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_slli_epi64(src.zmmi, IMMCNT); + emulate64_m512(OP_SLL, &expected, src, counts64_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_slli_epi64", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_slli_epi64(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_slli_epi64 0", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_slli_epi64(src.zmmi, 67); + xres.zmmi = _mm512_setzero_epi32(); + check_equal_nd(&zres, &xres, 16, "_mm512_slli_epi64 67", __LINE__); + + /* slli, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_slli_epi64(passthru.zmmi, myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi64(passthru.zmmi, myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_slli_epi64", __LINE__); + + /* slli, zero-masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_maskz_slli_epi64(myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi64(myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_slli_epi64", __LINE__); + + /* srai, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_srai_epi64(src.zmmi, IMMCNT); + emulate64_m512(OP_SRA, &expected, src, counts64_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_srai_epi64", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srai_epi64(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_srai_epi64 0", __LINE__); + + /* srai, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_srai_epi64(passthru.zmmi, myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi64(passthru.zmmi, myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_srai_epi64", __LINE__); + + /* srai, zero-masked. Reuses "expected" from above. 
*/ + + soft_update(src); + zres.zmmi = _mm512_maskz_srai_epi64(myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi64(myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_srai_epi64", __LINE__); + + /* srli, non-masked */ + + soft_update(src); + zres.zmmi = _mm512_srli_epi64(src.zmmi, IMMCNT); + emulate64_m512(OP_SRL, &expected, src, counts64_imm); + check_equal_nd(&zres, &expected, 16, "_mm512_srli_epi64", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srli_epi64(src.zmmi, 0); + check_equal_nd(&zres, &src, 16, "_mm512_srli_epi64 0", __LINE__); + + soft_update(src); + zres.zmmi = _mm512_srli_epi64(src.zmmi, 67); + xres.zmmi = _mm512_setzero_epi32(); + check_equal_nd(&zres, &xres, 16, "_mm512_srli_epi64 67", __LINE__); + + /* srli, masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_mask_srli_epi64(passthru.zmmi, myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_mask_mov_epi64(passthru.zmmi, myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_srli_epi64", __LINE__); + + /* srli, zero-masked. Reuses "expected" from above. */ + + soft_update(src); + zres.zmmi = _mm512_maskz_srli_epi64(myk8, src.zmmi, IMMCNT); + expected.zmmi = _mm512_maskz_mov_epi64(myk8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_srli_epi64", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_shifti32(); + do_shifti64(); + do_shiftv32(); + do_shiftv64(); + + do_rotate32(); + do_rotate64(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/shiftrot.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/shiftrot.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/shiftrot.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/swizzle.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/swizzle.c +++ SingleSource/UnitTests/Vector/AVX512F/swizzle.c @@ -0,0 +1,183 @@ + +/* + * Exercise some swizzles, upconverts and downconverts. + * This test was created to check correctness + * of the following intrinsics support: + * vmovdqa32() + * vmovdqa64() + */ + +#include "m512_test_util.h" +#include +#include +#include + +#if defined(__x86_64) || defined(_M_X64) +/* + * Exercise encoding zmm registers above zmm15, including one that + * has the fifth bit set and one that doesn't. 
+ */ +#define bigzmm zmm28 +#define mediumzmm zmm20 +#else +#define bigzmm zmm7 +#define mediumzmm zmm3 +#endif + +volatile int vi[16] = {0x12345678, 0x87654321, 0x05040302, 0x8594a3b2, + 0x92745638, 0xa0b0c0d0, 0xd0b0a040, 0x14322341, + 0xf24bee68, 0x3ed29ff6, 0xa2d46e46, 0x02119d99, + 0x1289a683, 0x0c4563de, 0x3edfd4a4, 0x49d52d48}; + +V512 src, dst1, dst2, t0, t1; + +static void NOINLINE init() { + int i; + + for (i = 0; i < 16; i++) { + src.s32[i] = vi[i]; + } + + memset(&dst1, 0, sizeof(dst1)); + memset(&dst2, 0, sizeof(dst2)); +} + +void NOINLINE do_mov_32() { + int i, v; + void *psrc = &src; + void *pdst = &dst1; + + init(); + + __asm { + mov FULL_IREG(cx), [psrc] + mov FULL_IREG(dx), [pdst] + vmovdqa32 bigzmm, [FULL_IREG(cx)] + vmovdqa32 mediumzmm, bigzmm + vmovdqu32 [FULL_IREG(dx)], mediumzmm + } + + for (i = 0; i < 16; i++) { + v = src.s32[i]; + dst2.s32[i] = v; + } + check_equal_nd(&dst1, &dst2, 16, "vmovdqa32", __LINE__); +} + +void NOINLINE do_mov_32_masked() { + int i, k; + void *psrc = &src; + void *pdst = &dst1; + + k = 0xaaaa; /* every other bit is set */ + + init(); + + __asm { + mov eax, k + kmovw k6, eax + mov FULL_IREG(cx), [psrc] + mov FULL_IREG(dx), [pdst] + vzeroall + vmovdqa32 bigzmm{k6}, [FULL_IREG(cx)] + vmovdqa32 mediumzmm { k6 }, bigzmm + vmovdqu32[FULL_IREG(dx)]{k6}, mediumzmm + } + + memset(&t0, 0, sizeof(t0)); + + /* Emulate vmovdqa32 bigzmm{k6}, [src] */ + + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + dst2.s32[i] = src.u32[i]; + } + } + check_equal_nd(&dst1, &dst2, 16, "vmovdqa32 masked", __LINE__); +} + +void NOINLINE do_mov_64() { + int i; + void *psrc = &src; + void *pdst = &dst1; + + init(); + + __asm { + mov FULL_IREG(cx), [psrc] + mov FULL_IREG(dx), [pdst] + vmovdqa64 bigzmm, [FULL_IREG(cx)] + vmovdqa64 mediumzmm, bigzmm + vmovdqu64 [FULL_IREG(dx)], mediumzmm + } + + for (i = 0; i < 8; i++) { + dst2.u64[i] = src.u64[i]; + } + check_equal_nq(&dst1, &dst2, 8, "vmovdqa64", __LINE__); +} + +void NOINLINE do_mov_64_masked() { + int i, k; + void *psrc = &src; + void *pdst = &dst1; + + k = 0xaa; /* every other bit is set */ + + init(); + + __asm { + mov eax, k + kmovw k5, eax + mov FULL_IREG(cx), [psrc] + mov FULL_IREG(dx), [pdst] + vzeroall + vmovdqa64 bigzmm{k5}, [FULL_IREG(cx)] + vmovdqa64 mediumzmm { k5 }, bigzmm + vmovdqu64[FULL_IREG(dx)]{k5}, mediumzmm + } + + memset(&t0, 0, sizeof(t0)); + + /* emulate vmovdqa64 bigzmm{k5}, [src] */ + + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + t0.u64[i] = src.u64[i]; + } + } + + /* emulate vmovdqa64 mediumzmm{k5}, bigzmm */ + + for (i = 0; i < 8; i++) { + t1.u64[i] = t0.u64[i]; + } + + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + t0.u64[i] = t1.u64[i]; + } + } + + /* emulate vmovdqu64 [dst1]{k5}, mediumzmm */ + + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + dst2.u64[i] = t0.u64[i]; + } + } + check_equal_nq(&dst1, &dst2, 8, "vmovdqa64 masked", __LINE__); +} + +int main() { + do_mov_32(); + do_mov_32_masked(); + do_mov_64(); + do_mov_64_masked(); + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/swizzle.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/swizzle.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/swizzle.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/t_getexp.c =================================================================== --- 
SingleSource/UnitTests/Vector/AVX512F/t_getexp.c +++ SingleSource/UnitTests/Vector/AVX512F/t_getexp.c @@ -0,0 +1,340 @@ +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm_getexp_round_*() + * _mm_mask_getexp_round_*() + * _mm_maskz_getexp_round_*() + * _mm512_getexp_*() + * _mm512_getexp_round_*() + * _mm512_mask_getexp_*() + * _mm512_mask_getexp_round_*() + * _mm512_maskz_getexp_*() + * _mm512_maskz_getexp_round_*() + */ + +#include "m512_test_util.h" +#include +#include +#include +#include + +int show_op = 0; + +static int NOINLINE check_ps(float val1[], float good[], int mask, int num_elem, + int zmask) { + int i; + int res = 1; + + for (i = 0; i < num_elem; i += 1) { + if ((1 << i) & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], good[i]); + } + } else if (zmask == 1) { + if (val1[i] != 0) { + res = 0; + printf("FAIL(%d): %f != 0\n", i, val1[i]); + } + } + } + return (res); +} + +static int NOINLINE check_ss(float val1[], float good[], float op2[], int mask, + int num_elem, int zmask) { + int i = 0; + int res = 1; + + // check first element + if (0x1 & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], good[i]); + } + } else if (zmask == 1) { + if (val1[i] != 0) { + res = 0; + printf("FAIL(%d): %f != 0\n", i, val1[i]); + } + } + + // check other elements + for (i = 1; i < num_elem; i += 1) { + if (val1[i] != op2[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], op2[i]); + } + } + return (res); +} + +static void NOINLINE print_f32_vec(char *pfx, float ivec[], int short_form) { + if (pfx) { + printf("%s: ", pfx); + } + if (!short_form) { + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[15], ivec[14], ivec[13], + ivec[12]); + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[11], ivec[10], ivec[9], + ivec[8]); + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]); + } + printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]); +} + +static void NOINLINE init_exp_f32(float ivalout[16], float ivalexp[16], + float source[16]) { + int i; + float expected[] = {10.0, 7.0, 24.0, 5.0, 9.0, 2.0, 22.0, 6.0, + 0.0, 21.0, 1.0, 18.0, 2.0, 22.0, 2.0, 16.0}; + +#pragma clang loop vectorize(disable) + for (i = 0; i < 16; i += 1) { + ivalout[i] = source[i]; + ivalexp[i] = floorf(log2(fabs((source[i])))); + ivalexp[i] = expected[i]; + } +} + +static int NOINLINE check_pd(double val1[], double good[], int mask, + int num_elem, int zmask) { + int i; + int res = 1; + + for (i = 0; i < num_elem; i += 1) { + if ((1 << i) & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], good[i]); + } + } else if (zmask == 1) { + if (val1[i] != 0) { + res = 0; + printf("FAIL(%d): %f != 0\n", i, val1[i]); + } + } + } + return (res); +} + +static int NOINLINE check_sd(double val1[], double good[], double op2[], + int mask, int num_elem, int zmask) { + int i = 0; + int res = 1; + + // check first element + if (0x1 & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], good[i]); + } + } else if (zmask == 1) { + if (val1[i] != 0) { + res = 0; + printf("FAIL(%d): %f != 0\n", i, val1[i]); + } + } + + // check other elements + for (i = 1; i < num_elem; i += 1) { + if (val1[i] != op2[i]) { + res = 0; + printf("FAIL(%d): %f != %f\n", i, val1[i], op2[i]); + } + } + return (res); +} + +static void NOINLINE print_f64_vec(char *pfx, double ivec[], int short_form) { + if (pfx) { + 
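+    /* Optional row label, e.g. "Opand1" or "Scalar", as passed in by the CHECK_RESULT_* macros. */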
printf("%s: ", pfx); + } + if (!short_form) { + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]); + } + printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]); +} + +static void NOINLINE init_exp_f64(double ivalout[8], double ivalexp[8], + double source[8]) { + int i; + double expected[] = {10.0, 7.0, 24.0, 5.0, 9.0, 2.0, 22.0, 19.0}; + +#pragma clang loop vectorize(disable) + for (i = 0; i < 8; i += 1) { + ivalout[i] = source[i]; + + // MS does not support log2() therefore we will use pre-calculated values. + // ivalexp[i] = floor(log2(fabs((source[i])))); + ivalexp[i] = expected[i]; + } +} + +#define CHECK_RESULT_PS(FUNC, RES, GOOD, OP, MMASK, NUMB, ZEROM) \ + { \ + int passed = 0; \ + passed = check_ps(RES.f32, GOOD.f32, MMASK, NUMB, ZEROM); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_f32_vec("Opand1", OP.f32, 0); \ + print_f32_vec("Scalar", GOOD.f32, 0); \ + print_f32_vec("Vector", RES.f32, 0); \ + } \ + } + +#define CHECK_RESULT_SS(FUNC, RES, GOOD, OP1, OP2, MMASK, NUMB, ZEROM) \ + { \ + int passed = 0; \ + passed = check_ss(RES.f32, GOOD.f32, OP2.f32, MMASK, NUMB, ZEROM); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_f32_vec("Opand1", OP1.f32, 1); \ + print_f32_vec("Opand2", OP2.f32, 1); \ + print_f32_vec("Scalar", GOOD.f32, 1); \ + print_f32_vec("Vector", RES.f32, 1); \ + } \ + } + +#define CHECK_RESULT_PD(FUNC, RES, GOOD, OP, MMASK, NUMB, ZEROM) \ + { \ + int passed = 0; \ + passed = check_pd(RES.f64, GOOD.f64, MMASK, NUMB, ZEROM); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_f64_vec("Opand1", OP.f64, 0); \ + print_f64_vec("Scalar", GOOD.f64, 0); \ + print_f64_vec("Vector", RES.f64, 0); \ + } \ + } + +#define CHECK_RESULT_SD(FUNC, RES, GOOD, OP1, OP2, MMASK, NUMB, ZEROM) \ + { \ + int passed = 0; \ + passed = check_sd(RES.f64, GOOD.f64, OP2.f64, MMASK, NUMB, ZEROM); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_f64_vec("Opand1", OP1.f64, 1); \ + print_f64_vec("Opand2", OP2.f64, 1); \ + print_f64_vec("Scalar", GOOD.f64, 1); \ + print_f64_vec("Vector", RES.f64, 1); \ + } \ + } + +static void NOINLINE getexp_float() { + V512 v0, v1, v2, v3, v4; + + float init[] = {1111.11, -222.22, 33333333.33, -44.44, + 555.55, -6.66, 7777777.77, -86.88, + -1.11, 2222222.22, -3.33, 444444.44, + -5.55, 6666666.66, -7.77, 88888.88}; + + init_exp_f32(v1.f32, v2.f32, init); + v4.zmm = _mm512_set1_ps(1.0); + v0.zmm = _mm512_setzero_ps(); + + v3.zmm = _mm512_getexp_ps(v1.zmm); + CHECK_RESULT_PS(_mm512_getexp_ps, v3, v2, v1, 0xffff, 16, 0); + + v3.zmm = _mm512_mask_getexp_ps(v0.zmm, 0x1ff8, v1.zmm); + CHECK_RESULT_PS(_mm512_mask_getexp_ps, v3, v2, v1, 0x1ff8, 16, 0); + + v3.zmm = _mm512_maskz_getexp_ps(0xf18f, v1.zmm); + CHECK_RESULT_PS(_mm512_maskz_getexp_ps, v3, v2, v1, 0xf18f, 16, 1); + + v3.zmm = _mm512_getexp_round_ps(v1.zmm, _MM_FROUND_TO_NEAREST_INT | + _MM_FROUND_NO_EXC); + CHECK_RESULT_PS(_mm512_getexp_round_ps, v3, v2, v1, 0xffff, 16, 0); + + v3.zmm = _mm512_mask_getexp_round_ps( + v0.zmm, 0x1ff8, v1.zmm, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_PS(_mm512_mask_getexp_round_ps, v3, v2, v1, 0x1ff8, 16, 0); + + v3.zmm = _mm512_maskz_getexp_round_ps( + 0xf18f, v1.zmm, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_PS(_mm512_maskz_getexp_round_ps, v3, v2, v1, 
0xf18f, 16, 1); + + v3.xmm[0] = _mm_getexp_round_ss( + v4.xmm[0], v1.xmm[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SS(_mm_getexp_round_ss, v3, v2, v1, v4, 0x1, 4, 0); + + v3.xmm[0] = + _mm_mask_getexp_round_ss(v0.xmm[0], 0x1, v4.xmm[0], v1.xmm[0], + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SS(_mm_mask_getexp_round_ss, v3, v2, v1, v4, 0x1, 4, 0); + + v3.xmm[0] = _mm_maskz_getexp_round_ss( + 0x0, v4.xmm[0], v1.xmm[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SS(_mm_maskz_getexp_round_ss, v3, v2, v1, v4, 0x0, 4, 1); +} + +static void NOINLINE getexp_double() { + V512 v0, v1, v2, v3, v4; + + double init[] = {1111.11, -222.22, 33333333.33, -44.44, + 555.55, -6.66, 7777777.77, -888888.88}; + + init_exp_f64(v1.f64, v2.f64, init); + v4.zmmd = _mm512_set1_pd(1.0); + v0.zmmd = _mm512_setzero_pd(); + + v3.zmmd = _mm512_getexp_pd(v1.zmmd); + CHECK_RESULT_PD(_mm512_getexp_pd, v3, v2, v1, 0xffff, 8, 0); + + v3.zmmd = _mm512_mask_getexp_pd(v0.zmmd, 0xf8, v1.zmmd); + CHECK_RESULT_PD(_mm512_mask_getexp_pd, v3, v2, v1, 0xf8, 8, 0); + + v3.zmmd = _mm512_maskz_getexp_pd(0x8f, v1.zmmd); + CHECK_RESULT_PD(_mm512_maskz_getexp_pd, v3, v2, v1, 0x8f, 8, 1); + + v3.zmmd = _mm512_getexp_round_pd(v1.zmmd, _MM_FROUND_TO_NEAREST_INT | + _MM_FROUND_NO_EXC); + CHECK_RESULT_PD(_mm512_getexp_round_pd, v3, v2, v1, 0xffff, 8, 0); + + v3.zmmd = _mm512_mask_getexp_round_pd( + v0.zmmd, 0x1f, v1.zmmd, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_PD(_mm512_mask_getexp_round_pd, v3, v2, v1, 0x1f, 8, 0); + + v3.zmmd = _mm512_maskz_getexp_round_pd( + 0xf1, v1.zmmd, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_PD(_mm512_maskz_getexp_round_pd, v3, v2, v1, 0xf1, 8, 1); + + v3.xmmd[0] = _mm_getexp_round_sd( + v4.xmmd[0], v1.xmmd[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SD(_mm_getexp_round_sd, v3, v2, v1, v4, 0x1, 2, 0); + + v3.xmmd[0] = + _mm_mask_getexp_round_sd(v0.xmmd[0], 0x1, v4.xmmd[0], v1.xmmd[0], + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SD(_mm_mask_getexp_round_sd, v3, v2, v1, v4, 0x1, 2, 0); + + v3.xmmd[0] = + _mm_maskz_getexp_round_sd(0x0, v4.xmmd[0], v1.xmmd[0], + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + CHECK_RESULT_SD(_mm_maskz_getexp_round_sd, v3, v2, v1, v4, 0x0, 2, 1); +} + +int main() { + getexp_float(); + getexp_double(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/t_getexp.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/t_getexp.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/t_getexp.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/t_movzext.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/t_movzext.c +++ SingleSource/UnitTests/Vector/AVX512F/t_movzext.c @@ -0,0 +1,94 @@ +/* + * Test MOVZEXT family intrinsics. 
+ * This test was created to check the correctness + * of the following intrinsics support: + * _mm256_zextps128_ps256() + * _mm256_zextpd128_pd256() + * _mm256_zextsi128_si256() + * _mm512_zextps128_ps512() + * _mm512_zextpd128_pd512() + * _mm512_zextsi128_si512() + * _mm512_zextps256_ps512() + * _mm512_zextpd256_pd512() + * _mm512_zextsi256_si512() + */ + +#include "m512_test_util.h" +#include +#include + +V512 src1; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + src1.s32[i] = (i + 1) * 1111111; + } +} + +void NOINLINE emulate_zext(V512 *res, V512 src1, int length) { + int j; + + for (j = 0; j < 16; j++) { + res->s32[j] = 0; + } + + for (j = 0; j < length; j++) { + res->s32[j] = src1.s32[j]; + } +} + +void NOINLINE do_zext() { + V512 res; + V512 expected; + + emulate_zext(&expected, src1, 4); + res.ymm[0] = _mm256_zextps128_ps256(src1.xmm[0]); + check_equal_nd(&res, &expected, 8, "_mm256_zextps128_ps256", __LINE__); + + emulate_zext(&expected, src1, 4); + res.ymmd[0] = _mm256_zextpd128_pd256(src1.xmmd[0]); + check_equal_nd(&res, &expected, 8, "_mm256_zextpd128_pd256", __LINE__); + + emulate_zext(&expected, src1, 4); + res.ymmi[0] = _mm256_zextsi128_si256(src1.xmmi[0]); + check_equal_nd(&res, &expected, 8, "_mm256_zextsi128_si256", __LINE__); + + emulate_zext(&expected, src1, 4); + res.zmm = _mm512_zextps128_ps512(src1.xmm[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextps128_ps512", __LINE__); + + emulate_zext(&expected, src1, 4); + res.zmmd = _mm512_zextpd128_pd512(src1.xmmd[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextpd128_pd512", __LINE__); + + emulate_zext(&expected, src1, 4); + res.zmmi = _mm512_zextsi128_si512(src1.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextsi128_si512", __LINE__); + + emulate_zext(&expected, src1, 8); + res.zmm = _mm512_zextps256_ps512(src1.ymm[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextps256_ps512", __LINE__); + + emulate_zext(&expected, src1, 8); + res.zmmd = _mm512_zextpd256_pd512(src1.ymmd[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextpd256_pd512", __LINE__); + + emulate_zext(&expected, src1, 8); + res.zmmi = _mm512_zextsi256_si512(src1.ymmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_zextsi256_si512", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_zext(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/t_movzext.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/t_movzext.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/t_movzext.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/undefined_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/undefined_m512.c +++ SingleSource/UnitTests/Vector/AVX512F/undefined_m512.c @@ -0,0 +1,39 @@ +#include "m512_test_util.h" +#include + +/* This test was created to check support + * of the following intrinsics: + * _mm512_undefined() + * _mm512_undefined_epi32() + * _mm512_undefined_pd() + * _mm512_undefined_ps() + */ + +__m512 NOINLINE do_undef() { + __m512 v1 = _mm512_undefined(); + __m512i v2 = _mm512_undefined_epi32(); + __m512d v3 = _mm512_undefined_pd(); + __m512 v4 = _mm512_undefined_ps(); + + return v4; +} + +int main(int argc, char *argv[]) { + /* + * These tests don't execute meaningfully, so don't call them. 
+ * argc is typically 1 but the compiler doesn't known that. + * We're really just ensuring that they get succesfully compiled, + * And that, if executed by an sde, they are successfully decoded. + */ + if (argc > 10) { + do_undef(); + } + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/undefined_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/undefined_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/undefined_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.c +++ SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.c @@ -0,0 +1,533 @@ + +/* + * Test the unpack{hi,lo} and shuffle intrinsics. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_mask_blend_*() + * _mm512_shuffle_*() + * _mm512_mask_shuffle_*() + * _mm_unpack*() + * _mm256_unpack*() + * _mm512_unpack*() + * _mm512_mask_unpack*() + * _mm512_maskz_unpack*() + */ + +#include "m512_test_util.h" +#include +#include + +volatile int vol0 = 0; +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src. + */ +#define soft_update(src) (src).xmmi[vol0] = (src).xmmi[vol0] + +V512 in8; +V512 in8_neg; +V512 in32; +V512 in32_neg; +V512 in32_mix; +V512 in64; +V512 in64_neg; +V512 in64_mix; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + in8.s8[i] = i; + in8_neg.s8[i] = -i; + } + + + for (i = 0; i < 16; i++) { + in32.s32[i] = i; + in32_neg.s32[i] = -i; + in32_mix.s32[i] = ((i % 3) == 0) ? -i : i; + } + + for (i = 0; i < 8; i++) { + in64.s64[i] = i; + in64_neg.s64[i] = -i; + in64_mix.s64[i] = ((i % 3) == 0) ? 
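+                     /* negate every third element to mix signs */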
-i : i; + } +} + +#define CHECK_UNPCKHBW(n_lanes, dest, mask, zeroing, name) \ + { \ + volatile int i, j, lane; \ + for (lane = 0; lane < n_lanes; lane++) { \ + for (i = 0, j = 0; i < 16; i += 2, j++) { \ + expected.s8[16 * lane + i] = in8.s8[16 * lane + 8 + j]; \ + expected.s8[16 * lane + i + 1] = in8_neg.s8[16 * lane + 8 + j]; \ + } \ + } \ + for (i = 0; i < n_lanes * 16; i++) { \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s8[i] = 0; \ + } else { \ + expected.s8[i] = dest.s8[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \ + in8_neg.ymmi[vol0] = in8_neg.ymmi[vol0]; \ + } + +#define CHECK_UNPCKH32(n_lanes, dest, mask, zeroing, name) \ + { \ + volatile int i, j, lane; \ + for (lane = 0; lane < n_lanes; lane++) { \ + for (i = 0, j = 0; i < 4; i += 2, j++) { \ + expected.s32[4 * lane + i] = in32.s32[4 * lane + 2 + j]; \ + expected.s32[4 * lane + i + 1] = in32_neg.s32[4 * lane + 2 + j]; \ + } \ + } \ + for (i = 0; i < n_lanes * 4; i++) { \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s32[i] = 0; \ + } else { \ + expected.s32[i] = dest.s32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \ + in32_neg.ymmi[vol0] = in32_neg.ymmi[vol0]; \ + } + +#define CHECK_UNPCKH64(n_lanes, dest, mask, zeroing, name) \ + { \ + volatile int i, j, lane; \ + for (lane = 0; lane < n_lanes; lane++) { \ + for (i = 0, j = 0; i < 2; i += 2, j++) { \ + expected.s64[2 * lane + i] = in64.s64[2 * lane + 1 + j]; \ + expected.s64[2 * lane + i + 1] = in64_neg.s64[2 * lane + 1 + j]; \ + } \ + } \ + for (i = 0; i < n_lanes * 2; i++) { \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s64[i] = 0; \ + } else { \ + expected.s64[i] = dest.s64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \ + in64_neg.ymmi[vol0] = in64_neg.ymmi[vol0]; \ + } + +#define CHECK_UNPCKL32(n_lanes, dest, mask, zeroing, name) \ + { \ + volatile int i, j, lane; \ + for (lane = 0; lane < n_lanes; lane++) { \ + for (i = 0, j = 0; i < 4; i += 2, j++) { \ + expected.s32[4 * lane + i] = in32.s32[4 * lane + j]; \ + expected.s32[4 * lane + i + 1] = in32_neg.s32[4 * lane + j]; \ + } \ + } \ + for (i = 0; i < n_lanes * 4; i++) { \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s32[i] = 0; \ + } else { \ + expected.s32[i] = dest.s32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \ + in32_neg.ymmi[vol0] = in32_neg.ymmi[vol0]; \ + } + +#define CHECK_UNPCKL64(n_lanes, dest, mask, zeroing, name) \ + { \ + volatile int i, j, lane; \ + for (lane = 0; lane < n_lanes; lane++) { \ + for (i = 0, j = 0; i < 2; i += 2, j++) { \ + expected.s64[2 * lane + i] = in64.s64[2 * lane + j]; \ + expected.s64[2 * lane + i + 1] = in64_neg.s64[2 * lane + j]; \ + } \ + } \ + for (i = 0; i < n_lanes * 2; i++) { \ + if ((mask & (1LL << i)) == 0) { \ + if (zeroing) { \ + expected.s64[i] = 0; \ + } else { \ + expected.s64[i] = dest.s64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \ + in64_neg.ymmi[vol0] = in64_neg.ymmi[vol0]; \ + } + +void NOINLINE do_unpckps_lo() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.zmm = _mm512_unpacklo_ps(in32.zmm, in32_neg.zmm); + CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_unpacklo_ps"); + + k = 0xA4A4; + res.zmm = _mm512_mask_unpacklo_ps(in32_mix.zmm, k, in32.zmm, in32_neg.zmm); + CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_mask_unpacklo_ps"); + + res.zmm = 
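+      /* maskz form: lanes with a clear bit in k are zeroed rather than taken from a passthru operand */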
_mm512_maskz_unpacklo_ps(k, in32.zmm, in32_neg.zmm); + CHECK_UNPCKL32(4, in32_mix, k, 1, "_mm512_maskz_unpacklo_ps"); +} + +void NOINLINE do_unpckps_hi() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.xmm[vol0] = _mm_unpackhi_ps(in32.xmm[vol0], in32_neg.xmm[vol0]); + CHECK_UNPCKH32(1, in32_mix, k, 0, "_mm_unpackhi_ps"); + res.ymm[vol0] = _mm256_unpackhi_ps(in32.ymm[vol0], in32_neg.ymm[vol0]); + CHECK_UNPCKH32(2, in32_mix, k, 0, "_mm256_unpackhi_ps"); + res.zmm = _mm512_unpackhi_ps(in32.zmm, in32_neg.zmm); + CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_unpackhi_ps"); + + k = 0xA4A4; + res.zmm = _mm512_mask_unpackhi_ps(in32_mix.zmm, k, in32.zmm, in32_neg.zmm); + CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_mask_unpackhi_ps"); + + res.zmm = _mm512_maskz_unpackhi_ps(k, in32.zmm, in32_neg.zmm); + CHECK_UNPCKH32(4, in32_mix, k, 1, "_mm512_maskz_unpackhi_ps"); +} + +void NOINLINE do_unpckdq_lo() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.xmmi[vol0] = _mm_unpacklo_epi32(in32.xmmi[vol0], in32_neg.xmmi[vol0]); + CHECK_UNPCKL32(1, in32_mix, k, 0, "_mm_unpacklo_epi32"); + res.ymmi[vol0] = _mm256_unpacklo_epi32(in32.ymmi[vol0], in32_neg.ymmi[vol0]); + CHECK_UNPCKL32(2, in32_mix, k, 0, "_mm256_unpacklo_epi32"); + res.zmmi = _mm512_unpacklo_epi32(in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_unpacklo_epi32"); + + k = 0xA4A4; + res.zmmi = _mm512_mask_unpacklo_epi32(in32_mix.zmmi, k, in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_mask_unpacklo_epi32"); + + res.zmmi = _mm512_maskz_unpacklo_epi32(k, in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKL32(4, in32_mix, k, 1, "_mm512_maskz_unpacklo_epi32"); +} + +void NOINLINE do_unpckqdq_lo() { + V512 res; + V512 expected; + __mmask8 k8 = 0xFF; + + res.zmmi = _mm512_unpacklo_epi64(in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_unpacklo_epi64"); + + k8 = 0x4A; + res.zmmi = + _mm512_mask_unpacklo_epi64(in64_mix.zmmi, k8, in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_mask_unpacklo_epi64"); + + res.zmmi = _mm512_maskz_unpacklo_epi64(k8, in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKL64(4, in64_mix, k8, 1, "_mm512_maskz_unpacklo_epi64"); +} + +void NOINLINE do_unpckpd_lo() { + V512 res; + V512 expected; + __mmask8 k8 = 0xFF; + + res.zmmd = _mm512_unpacklo_pd(in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_unpacklo_pd"); + + k8 = 0x4A; + res.zmmd = _mm512_mask_unpacklo_pd(in64_mix.zmmd, k8, in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_mask_unpacklo_pd"); + + res.zmmd = _mm512_maskz_unpacklo_pd(k8, in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKL64(4, in64_mix, k8, 1, "_mm512_maskz_unpacklo_pd"); +} + +#define CHECK_BLENDM(n_elems, width, type, mask, src1, src2, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; ++i) { \ + if ((mask & (1LL << i)) == 0) { \ + expected.type[i] = src1.type[i]; \ + } else { \ + expected.type[i] = src2.type[i]; \ + } \ + } \ + check_equal_nd(&res, &expected, (n_elems * width) / 4, name, __LINE__); \ + src2.ymmi[vol0] = src2.ymmi[vol0]; \ + } + +void NOINLINE do_blendmd() { + V512 res; + V512 expected; + __mmask16 k = 0xA44A; + + res.zmmi = _mm512_mask_blend_epi32(k, in32.zmmi, in32_neg.zmmi); + CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_blend_epi32"); + res.zmmi = _mm512_mask_mov_epi32(in32.zmmi, k, in32_neg.zmmi); + CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_mov_epi32"); +} + +void NOINLINE do_blendmq() { + V512 res; + V512 expected; + __mmask8 k 
= 0xA4; + + res.zmmi = _mm512_mask_blend_epi64(k, in64.zmmi, in64_neg.zmmi); + CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_blend_epi64"); + res.zmmi = _mm512_mask_mov_epi64(in64.zmmi, k, in64_neg.zmmi); + CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_mov_epi64"); +} + +void NOINLINE do_unpckqdq_hi() { + V512 res; + V512 expected; + __mmask8 k8 = 0xFF; + + res.zmmi = _mm512_unpackhi_epi64(in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_unpackhi_epi64"); + + k8 = 0x4A; + res.zmmi = + _mm512_mask_unpackhi_epi64(in64_mix.zmmi, k8, in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_mask_unpackhi_epi64"); + + res.zmmi = _mm512_maskz_unpackhi_epi64(k8, in64.zmmi, in64_neg.zmmi); + CHECK_UNPCKH64(4, in64_mix, k8, 1, "_mm512_maskz_unpackhi_epi64"); +} + +void NOINLINE do_unpckpd_hi() { + V512 res; + V512 expected; + __mmask8 k8 = 0xFF; + + res.xmmd[vol0] = _mm_unpackhi_pd(in64.xmmd[vol0], in64_neg.xmmd[vol0]); + CHECK_UNPCKH64(1, in64_mix, k8, 0, "_mm_unpackhi_pd"); + res.ymmd[vol0] = _mm256_unpackhi_pd(in64.ymmd[vol0], in64_neg.ymmd[vol0]); + CHECK_UNPCKH64(2, in64_mix, k8, 0, "_mm256_unpackhi_pd"); + res.zmmd = _mm512_unpackhi_pd(in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_unpackhi_pd"); + + k8 = 0x4A; + res.zmmd = _mm512_mask_unpackhi_pd(in64_mix.zmmd, k8, in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_mask_unpackhi_pd"); + + res.zmmd = _mm512_maskz_unpackhi_pd(k8, in64.zmmd, in64_neg.zmmd); + CHECK_UNPCKH64(4, in64_mix, k8, 1, "_mm512_maskz_unpackhi_pd"); +} + +void NOINLINE do_shuf_ps() { + V512 res; + V512 expected; + volatile int i, lane; + __mmask16 k = 0x7e95; +#define PS_IMM 0x5c + + res.zmm = _mm512_shuffle_ps(in32.zmm, in32_neg.zmm, PS_IMM); + + for (lane = 0; lane < 4; lane++) { + for (i = 0; i < 1; i++) { + expected.s32[4 * lane + i] = in32.s32[4 * lane + (PS_IMM & 3)]; + expected.s32[4 * lane + i + 1] = in32.s32[4 * lane + ((PS_IMM >> 2) & 3)]; + expected.s32[4 * lane + 2 + i] = + in32_neg.s32[4 * lane + ((PS_IMM >> 4) & 3)]; + expected.s32[4 * lane + 2 + i + 1] = + in32_neg.s32[4 * lane + ((PS_IMM >> 6) & 3)]; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_shuffle_ps", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_shuffle_ps(res.zmm, k, in32.zmm, in32_neg.zmm, PS_IMM); + + expected.zmmi = _mm512_setzero_epi32(); + for (lane = 0; lane < 4; lane++) { + for (i = 0; i < 1; i++) { + int m = 4 * lane; + if ((1 << (m + i)) & k) { + expected.s32[m + i] = in32.s32[m + (PS_IMM & 3)]; + } + if ((1 << (m + i + 1)) & k) { + expected.s32[m + i + 1] = in32.s32[m + ((PS_IMM >> 2) & 3)]; + } + if ((1 << (m + 2 + i)) & k) { + expected.s32[m + 2 + i] = in32_neg.s32[m + ((PS_IMM >> 4) & 3)]; + } + if ((1 << (m + 2 + i + 1)) & k) { + expected.s32[m + 2 + i + 1] = in32_neg.s32[m + ((PS_IMM >> 6) & 3)]; + } + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_ps", __LINE__); +} + +void NOINLINE do_unpckdq_hi() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.zmmi = _mm512_unpackhi_epi32(in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_unpackhi_epi32"); + + k = 0xA4A4; + res.zmmi = _mm512_mask_unpackhi_epi32(in32_mix.zmmi, k, in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_mask_unpackhi_epi32"); + + res.zmmi = _mm512_maskz_unpackhi_epi32(k, in32.zmmi, in32_neg.zmmi); + CHECK_UNPCKH32(4, in32_mix, k, 1, "_mm512_maskz_unpackhi_epi32"); +} + +void NOINLINE do_shuf_pd() { + V512 
res; + V512 expected; + volatile int i, lane; + __mmask8 k = 0xba; +#define PD_IMM 0x7b + + res.zmmd = _mm512_shuffle_pd(in64.zmmd, in64_neg.zmmd, PD_IMM); + + for (lane = 0; lane < 4; lane++) { + int m = 2 * lane; + for (i = 0; i < 1; i++) { + expected.s64[m + i] = in64.s64[m + ((PD_IMM >> m) & 1)]; + expected.s64[m + i + 1] = in64_neg.s64[m + ((PD_IMM >> (m + 1)) & 1)]; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_shuffle_pd", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = + _mm512_mask_shuffle_pd(res.zmmd, k, in64.zmmd, in64_neg.zmmd, PD_IMM); + + expected.zmmi = _mm512_setzero_epi32(); + for (lane = 0; lane < 4; lane++) { + int m = 2 * lane; + for (i = 0; i < 1; i++) { + if ((1 << (m + i)) & k) { + expected.s64[m + i] = in64.s64[m + ((PD_IMM >> m) & 1)]; + } + if ((1 << (m + i + 1)) & k) { + expected.s64[m + i + 1] = in64_neg.s64[m + ((PD_IMM >> (m + 1)) & 1)]; + } + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_pd", __LINE__); +} + +void NOINLINE do_shuf_f32x4() { + V512 res; + V512 expected; + V512 tmp; + volatile int i, j, lane; + __mmask16 k = 0x7e95; +#define F32X4_IMM 0x5c + + res.zmm = _mm512_shuffle_f32x4(in32.zmm, in32_neg.zmm, F32X4_IMM); + + // Unlike shuffle_ps, f32x4 selects whole 128-bit lanes: the low two result + // lanes come from in32, the high two from in32_neg. + for (lane = 0; lane < 4; lane++) { + j = ((F32X4_IMM >> 2 * lane) & 0x3); + if (lane < 2) { + expected.xmmi[lane] = in32.xmmi[j]; + } else { + expected.xmmi[lane] = in32_neg.xmmi[j]; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_shuffle_f32x4", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = + _mm512_mask_shuffle_f32x4(res.zmm, k, in32.zmm, in32_neg.zmm, F32X4_IMM); + + expected.zmmi = _mm512_setzero_epi32(); + for (lane = 0; lane < 4; lane++) { + int m = 4 * lane; + j = ((F32X4_IMM >> 2 * lane) & 0x3); + if (lane < 2) { + tmp.xmmi[lane] = in32.xmmi[j]; + } else { + tmp.xmmi[lane] = in32_neg.xmmi[j]; + } + + for (i = 0; i < 4; i++) { + if ((1 << (m + i)) & k) { + expected.s32[m + i] = tmp.s32[m + i]; + } + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_f32x4", __LINE__); +} + +void NOINLINE do_blendmpd() { + V512 res; + V512 expected; + __mmask8 k = 0x4A; + + soft_update(in64_neg); + res.zmmd = _mm512_mask_blend_pd(k, in64.zmmd, in64_neg.zmmd); + CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_blend_pd"); + res.zmmd = _mm512_mask_mov_pd(in64.zmmd, k, in64_neg.zmmd); + CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_mov_pd"); +} + +void NOINLINE do_blendmps() { + V512 res; + V512 expected; + __mmask16 k = 0xA44A; + + res.zmm = _mm512_mask_blend_ps(k, in32.zmm, in32_neg.zmm); + CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_blend_ps"); + res.zmm = _mm512_mask_mov_ps(in32.zmm, k, in32_neg.zmm); + CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_mov_ps"); +} + +int main(int argc, char *argv[]) { + init(); + + do_shuf_f32x4(); + do_shuf_pd(); + do_shuf_ps(); + + do_unpckdq_hi(); + + do_unpckps_lo(); + do_unpckps_hi(); + + do_unpckdq_lo(); + do_unpckqdq_lo(); + + do_unpckpd_lo(); + do_unpckpd_hi(); + + do_unpckqdq_hi(); + + do_blendmd(); + do_blendmq(); + do_blendmpd(); + do_blendmps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.reference_output
+++ SingleSource/UnitTests/Vector/AVX512F/unpack_shuffle.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512F/vpmovdown.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/vpmovdown.c +++ SingleSource/UnitTests/Vector/AVX512F/vpmovdown.c @@ -0,0 +1,1111 @@ +/* + * Test intrinsics related to integer down-converting instructions + * like vpmovdb. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_cvtepi*_epi*() + * _mm512_mask_cvtepi*_epi*() + * _mm512_maskz_cvtepi*_epi*() + * _mm512_mask_cvtepi*_storeu_epi*() + * _mm512_cvtsepi*_epi*() + * _mm512_mask_cvtsepi*_epi*() + * _mm512_maskz_cvtsepi*_epi*() + * _mm512_mask_cvtsepi*_storeu_epi*() + * _mm512_cvtusepi*_epi*() + * _mm512_mask_cvtusepi*_epi*() + * _mm512_maskz_cvtusepi*_epi*() + * _mm512_mask_cvtusepi*_storeu_epi*() + */ + +#include "m512_test_util.h" +#include <stdio.h> +#include <string.h> + +volatile int vol0 = 0; + +#define soft_src_update(var) var.xmmi[vol0] = var.xmmi[vol0] + +V512 i8; +V512 i16; +V512 i32; +V512 i32_mix; +V512 i64; +V512 i64_mix; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = i; + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = i; + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_mix.s32[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? i : -i; + } +} + +void NOINLINE do_pmovdb() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x79ab; + + res.xmmi[0] = _mm512_cvtepi32_epi8(i32.zmmi); + + for (i = 0; i < 16; i++) { + expected.s8[i] = i32.s32[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtepi32_epi8(res.xmmi[1], k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[0] = _mm512_maskz_cvtepi32_epi8(k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtepi32_epi8", __LINE__); + + soft_src_update(i32); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + + _mm512_mask_cvtepi32_storeu_epi8(&(res.xmmi[0]), k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = i32.s32[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi32_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovsdb() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0xbadc; + + res.xmmi[0] = _mm512_cvtsepi32_epi8(i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + expected.s8[i] = (i32_mix.s32[i] < -128) + ? -128 + : ((i32_mix.s32[i] > 127) ? 127 : i32_mix.s32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtsepi32_epi8(res.xmmi[1], k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i32_mix.s32[i] < -128) + ? -128 + : ((i32_mix.s32[i] > 127) ?
127 : i32_mix.s32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm512_maskz_cvtsepi32_epi8(k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i32_mix.s32[i] < -128) + ? -128 + : ((i32_mix.s32[i] > 127) ? 127 : i32_mix.s32[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtsepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtsepi32_storeu_epi8(&(res.xmmi[0]), k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i32_mix.s32[i] < -128) + ? -128 + : ((i32_mix.s32[i] > 127) ? 127 : i32_mix.s32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtsepi32_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovusdb() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x57fd; + + res.xmmi[0] = _mm512_cvtusepi32_epi8(i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + expected.u8[i] = ((i32_mix.u32[i] > 255) ? 255 : i32_mix.u32[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtusepi32_epi8(res.xmmi[1], k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u8[i] = ((i32_mix.u32[i] > 255) ? 255 : i32_mix.u32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[0] = _mm512_maskz_cvtusepi32_epi8(k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u8[i] = ((i32_mix.u32[i] > 255) ? 255 : i32_mix.u32[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtusepi32_epi8", __LINE__); + + soft_src_update(i32_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtusepi32_storeu_epi8(&(res.xmmi[0]), k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u8[i] = ((i32_mix.u32[i] > 255) ? 255 : i32_mix.u32[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi32_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovdw() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x97cd; + + res.ymmi[0] = _mm512_cvtepi32_epi16(i32.zmmi); + + for (i = 0; i < 16; i++) { + expected.s16[i] = i32.s32[i]; + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = i8.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtepi32_epi16(res.ymmi[1], k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } else { + expected.s16[i] = res.s16[16 + i]; // From res.ymmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.ymmi[0] = _mm512_maskz_cvtepi32_epi16(k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtepi32_epi16", __LINE__); + + soft_src_update(i32); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtepi32_storeu_epi16(&(res.ymmi[0]), k, i32.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = i32.s32[i]; + } else { + expected.s16[i] = res.s16[16 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovsdw() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x63fe; + + res.ymmi[0] = _mm512_cvtsepi32_epi16(i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + expected.s16[i] = + (i32_mix.s32[i] < (int)0xffff8000) + ? (int)0xffff8000 + : ((i32_mix.s32[i] > (int)0x7fff) ? (int)0x7fff : i32_mix.s32[i]); + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[1] = i16.ymmi[0]; + res.ymmi[0] = i16.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtsepi32_epi16(res.ymmi[1], k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = + (i32_mix.s32[i] < (int)0xffff8000) + ? (int)0xffff8000 + : ((i32_mix.s32[i] > (int)0x7fff) ? (int)0x7fff : i32_mix.s32[i]); + } else { + expected.s16[i] = res.s16[16 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[0] = _mm512_maskz_cvtsepi32_epi16(k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = + (i32_mix.s32[i] < (int)0xffff8000) + ? (int)0xffff8000 + : ((i32_mix.s32[i] > (int)0x7fff) ? (int)0x7fff : i32_mix.s32[i]); + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtsepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[1] = i16.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtsepi32_storeu_epi16(&(res.ymmi[0]), k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.s16[i] = + (i32_mix.s32[i] < (int)0xffff8000) + ? (int)0xffff8000 + : ((i32_mix.s32[i] > (int)0x7fff) ? (int)0x7fff : i32_mix.s32[i]); + } else { + expected.s16[i] = res.s16[16 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtsepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovusdw() { + V512 res; + V512 expected; + volatile int i; + __mmask16 k = 0x63fe; + + res.ymmi[0] = _mm512_cvtusepi32_epi16(i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + expected.u16[i] = + (i32_mix.u32[i] > (int)0xffff) ? (int)0xffff : i32_mix.u32[i]; + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[1] = i16.ymmi[0]; + res.ymmi[0] = i16.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtusepi32_epi16(res.ymmi[1], k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u16[i] = + (i32_mix.u32[i] > (int)0xffff) ? (int)0xffff : i32_mix.u32[i]; + } else { + expected.u16[i] = res.u16[16 + i]; // From res.ymmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[0] = _mm512_maskz_cvtusepi32_epi16(k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u16[i] = + (i32_mix.u32[i] > (int)0xffff) ? (int)0xffff : i32_mix.u32[i]; + } else { + expected.u16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtusepi32_epi16", __LINE__); + + soft_src_update(i32_mix); + res.ymmi[1] = i16.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtusepi32_storeu_epi16(&(res.ymmi[0]), k, i32_mix.zmmi); + + for (i = 0; i < 16; i++) { + if ((1 << i) & k) { + expected.u16[i] = + (i32_mix.u32[i] > (int)0xffff) ? (int)0xffff : i32_mix.u32[i]; + } else { + expected.u16[i] = res.u16[16 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtusepi32_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x76; + + res.xmmi[0] = _mm512_cvtepi64_epi8(i64.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + expected.s8[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtepi64_epi8", __LINE__); + + /* + * Exercise mix with store. + */ + + _mm_store_sd(&res.f64[2], _mm_castsi128_pd(_mm512_cvtepi64_epi8(i64.zmmi))); + check_equal_nd(&res.f64[2], &expected, 2, "_mm512_cvtepi64_epi8 mix with store", + __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtepi64_epi8(res.xmmi[1], k, i64.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi64_epi8", __LINE__); + + soft_src_update(i64); + res.xmmi[0] = _mm512_maskz_cvtepi64_epi8(k, i64.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtepi64_epi8", __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtepi64_storeu_epi8(&(res.xmmi[0]), k, i64.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = i64.s64[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + // Memory-form instruction does not zero high half of result. + check_equal_nd(&res, &expected, 2, "_mm512_mask_cvtepi64_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovsqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x67; + + res.xmmi[0] = _mm512_cvtsepi64_epi8(i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + expected.s8[i] = (i64_mix.s64[i] < -128) + ? -128 + : ((i64_mix.s64[i] > 127) ? 127 : i64_mix.s64[i]); + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtsepi64_epi8(res.xmmi[1], k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i64_mix.s64[i] < -128) + ? -128 + : ((i64_mix.s64[i] > 127) ? 127 : i64_mix.s64[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm512_maskz_cvtsepi64_epi8(k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i64_mix.s64[i] < -128) + ? -128 + : ((i64_mix.s64[i] > 127) ? 127 : i64_mix.s64[i]); + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtsepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtsepi64_storeu_epi8(&(res.xmmi[0]), k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s8[i] = (i64_mix.s64[i] < -128) + ? -128 + : ((i64_mix.s64[i] > 127) ? 127 : i64_mix.s64[i]); + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + // Memory-form instruction does not zero high half of result. + check_equal_nd(&res, &expected, 2, "_mm512_mask_cvtsepi64_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovusqb() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0x67; + + res.xmmi[0] = _mm512_cvtusepi64_epi8(i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + expected.u8[i] = (i64_mix.u64[i] > 255) ? 255 : i64_mix.u64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtusepi64_epi8(res.xmmi[1], k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.u8[i] = (i64_mix.u64[i] > 255) ? 255 : i64_mix.u64[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm512_maskz_cvtusepi64_epi8(k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.u8[i] = (i64_mix.u64[i] > 255) ? 255 : i64_mix.u64[i]; + } else { + expected.s8[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtusepi64_epi8", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i16.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtusepi64_storeu_epi8(&(res.xmmi[0]), k, i64_mix.zmmi); + + expected.u64[1] = 0; + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.u8[i] = (i64_mix.u64[i] > 255) ? 255 : i64_mix.u64[i]; + } else { + expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1]. + } + } + // Memory-form instruction does not zero high half of result. + check_equal_nd(&res, &expected, 2, "_mm512_mask_cvtusepi64_storeu_epi8", + __LINE__); +} + +void NOINLINE do_pmovqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + + res.xmmi[0] = _mm512_cvtepi64_epi16(i64.zmmi); + + for (i = 0; i < 8; i++) { + expected.s16[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtepi64_epi16(res.xmmi[1], k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[0] = _mm512_maskz_cvtepi64_epi16(k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi64_epi16", __LINE__); + + soft_src_update(i64); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtepi64_storeu_epi16(&(res.xmmi[0]), k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s16[i] = i64.s64[i]; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovsqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + __int64 r; + + res.xmmi[0] = _mm512_cvtsepi64_epi16(i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + r = i64_mix.s64[i]; + if (r < (int)0xffff8000) { + r = (int)0xffff8000; + } else if (r > (int)0x7fff) { + r = (int)0x7fff; + } + expected.s16[i] = r; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtsepi64_epi16(res.xmmi[1], k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0xffff8000) { + r = (int)0xffff8000; + } else if (r > (int)0x7fff) { + r = (int)0x7fff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm512_maskz_cvtsepi64_epi16(k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0xffff8000) { + r = (int)0xffff8000; + } else if (r > (int)0x7fff) { + r = (int)0x7fff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_maskz_cvtsepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtsepi64_storeu_epi16(&(res.xmmi[0]), k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0xffff8000) { + r = (int)0xffff8000; + } else if (r > (int)0x7fff) { + r = (int)0x7fff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtsepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovusqw() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xe9; + unsigned __int64 r; + + res.xmmi[0] = _mm512_cvtusepi64_epi16(i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + r = i64_mix.u64[i]; + if (r > 0xffff) { + r = 0xffff; + } + expected.s16[i] = r; + } + + check_equal_nd(&res, &expected, 4, "_mm512_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = i16.xmmi[1]; + res.xmmi[0] = _mm512_mask_cvtusepi64_epi16(res.xmmi[1], k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > 0xffff) { + r = 0xffff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[0] = _mm512_maskz_cvtusepi64_epi16(k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > 0xffff) { + r = 0xffff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = 0; + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi64_epi16", __LINE__); + + soft_src_update(i64_mix); + res.xmmi[1] = i8.xmmi[0]; + res.xmmi[0] = res.xmmi[1]; + _mm512_mask_cvtusepi64_storeu_epi16(&(res.xmmi[0]), k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > 0xffff) { + r = 0xffff; + } + expected.s16[i] = r; + } else { + expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1]. + } + } + + check_equal_nd(&res, &expected, 4, "_mm512_mask_cvtusepi64_storeu_epi16", + __LINE__); +} + +void NOINLINE do_pmovqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + + res.ymmi[0] = _mm512_cvtepi64_epi32(i64.zmmi); + + for (i = 0; i < 8; i++) { + expected.s32[i] = i64.s64[i]; + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = i8.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtepi64_epi32(res.ymmi[1], k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.ymmi[0] = _mm512_maskz_cvtepi64_epi32(k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtepi64_epi32", __LINE__); + + soft_src_update(i64); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtepi64_storeu_epi32(&(res.ymmi[0]), k, i64.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + expected.s32[i] = i64.s64[i]; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtepi64_storeu_epi32", + __LINE__); +} + +void NOINLINE do_pmovsqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + __int64 r; + + res.ymmi[0] = _mm512_cvtsepi64_epi32(i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + r = i64_mix.s64[i]; + if (r < (int)0x80000000) { + r = (int)0x80000000; + } else if (r > (int)0x7fffffff) { + r = (int)0x7fffffff; + } + expected.s32[i] = r; + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = i8.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtsepi64_epi32(res.ymmi[1], k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0x80000000) { + r = (int)0x80000000; + } else if (r > (int)0x7fffffff) { + r = (int)0x7fffffff; + } + expected.s32[i] = r; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[0] = _mm512_maskz_cvtsepi64_epi32(k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0x80000000) { + r = (int)0x80000000; + } else if (r > (int)0x7fffffff) { + r = (int)0x7fffffff; + } + expected.s32[i] = r; + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtsepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtsepi64_storeu_epi32(&(res.ymmi[0]), k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.s64[i]; + if (r < (int)0x80000000) { + r = (int)0x80000000; + } else if (r > (int)0x7fffffff) { + r = (int)0x7fffffff; + } + expected.s32[i] = r; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtsepi64_storeu_epi32", + __LINE__); +} + +void NOINLINE do_pmovusqd() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xcf; + unsigned __int64 r; + + res.ymmi[0] = _mm512_cvtusepi64_epi32(i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + r = i64_mix.u64[i]; + if (r > (unsigned int)0xffffffff) { + r = (unsigned int)0xffffffff; + } + expected.u32[i] = r; + } + + check_equal_nd(&res, &expected, 8, "_mm512_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = i8.ymmi[1]; + res.ymmi[0] = _mm512_mask_cvtusepi64_epi32(res.ymmi[1], k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > (unsigned int)0xffffffff) { + r = (unsigned int)0xffffffff; + } + expected.u32[i] = r; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[0] = _mm512_maskz_cvtusepi64_epi32(k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > (unsigned int)0xffffffff) { + r = (unsigned int)0xffffffff; + } + expected.u32[i] = r; + } else { + expected.s32[i] = 0; + } + } + + check_equal_nd(&res, &expected, 8, "_mm512_maskz_cvtusepi64_epi32", __LINE__); + + soft_src_update(i64_mix); + res.ymmi[1] = i8.ymmi[0]; + res.ymmi[0] = res.ymmi[1]; + _mm512_mask_cvtusepi64_storeu_epi32(&(res.ymmi[0]), k, i64_mix.zmmi); + + for (i = 0; i < 8; i++) { + if ((1 << i) & k) { + r = i64_mix.u64[i]; + if (r > (unsigned int)0xffffffff) { + r = (unsigned int)0xffffffff; + } + expected.u32[i] = r; + } else { + expected.s32[i] = res.s32[8 + i]; // From res.ymmi[1]. 
+ } + } + + check_equal_nd(&res, &expected, 8, "_mm512_mask_cvtusepi64_storeu_epi32", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_pmovdb(); + do_pmovsdb(); + do_pmovusdb(); + + do_pmovdw(); + do_pmovsdw(); + do_pmovusdw(); + + do_pmovqb(); + do_pmovsqb(); + do_pmovusqb(); + + do_pmovqw(); + do_pmovsqw(); + do_pmovusqw(); + + do_pmovqd(); + do_pmovsqd(); + do_pmovusqd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512F/vpmovdown.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512F/vpmovdown.reference_output +++ SingleSource/UnitTests/Vector/AVX512F/vpmovdown.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/CMakeLists.txt +++ SingleSource/UnitTests/Vector/CMakeLists.txt @@ -1,3 +1,4 @@ +set(VECTOR_MAIN_DIR SingleSource/UnitTests/Vector) if(ARCH STREQUAL "PowerPC") add_subdirectory(Altivec) endif() @@ -16,6 +17,8 @@ if(ARCH STREQUAL "x86") if(X86CPU_ARCH STREQUAL "skylake-avx512") add_subdirectory(AVX512F) + add_subdirectory(AVX512BWVL) + add_subdirectory(AVX512DQ) endif() if(X86CPU_ARCH STREQUAL "knl") add_subdirectory(AVX512F) Index: SingleSource/UnitTests/Vector/Makefile =================================================================== --- SingleSource/UnitTests/Vector/Makefile +++ SingleSource/UnitTests/Vector/Makefile @@ -20,6 +20,12 @@ ifeq ($(HAVE_X86_AVX512F_INSTRUCTIONS), 1) DIRS += AVX512F endif +ifeq ($(HAVE_X86_AVX512BW_INSTRUCTIONS), 1) +DIRS += AVX512BWVL +endif +ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1) +DIRS += AVX512DQ +endif endif # Assume ARMv7 implies NEON. Index: SingleSource/UnitTests/Vector/m512_test_util.h =================================================================== --- SingleSource/UnitTests/Vector/m512_test_util.h +++ SingleSource/UnitTests/Vector/m512_test_util.h @@ -0,0 +1,327 @@ +#ifndef M512_TEST_UTIL_H_INCLUDED +#define M512_TEST_UTIL_H_INCLUDED + +/* + * Common declarations useful for writing 512-bit unit tests. + */ + +#include <immintrin.h> +#include <stdio.h> +#include <string.h> + +#define ALIGNTO(n) __declspec(align(n)) + +/* + * For purposes of unit tests it can be beneficial to suppress inlining + * simply so that only a single instance of a test function is emitted. + * Makes it easier to diff A/B assembly output. + */ +#define NOINLINE __declspec(noinline) + +/* + * FULL_IREG(ax) expands to either eax or rax depending on the target. + */ +#if defined(__x86_64) || defined(_M_X64) +#define FULL_IREG(reg) r##reg +#else +#define FULL_IREG(reg) e##reg +#endif + +/* Number of elements in an array. */ +#define ASIZE(a) (sizeof((a)) / sizeof((a)[0])) + +typedef __int64 I64; +typedef unsigned __int64 U64; + +typedef union ALIGNTO(64) { + + __m512 zmm; + __m512d zmmd; + __m512i zmmi; + + __m256 ymm[2]; + __m256d ymmd[2]; + __m256i ymmi[2]; + + __m128 xmm[4]; + __m128d xmmd[4]; + __m128i xmmi[4]; + + char c[64]; + signed char s8[64]; + unsigned char u8[64]; + short s16[32]; + unsigned short u16[32]; + int s32[16]; + unsigned int u32[16]; + float f32[16]; + I64 s64[8]; + U64 u64[8]; + double f64[8]; + +} V512; + +int n_errs = 0; + +/* + * Print the low N 8-bit unsigned integers from p.
+ */ + +void NOINLINE display_pb(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.2x", p->u8[i]); + if (i > 0 && i % 16 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N 16-bit unsigned integers from p. + */ + +void NOINLINE display_pw(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.4x", p->u16[i]); + if (i > 0 && i % 8 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N 32-bit unsigned integers from p. + */ + +void NOINLINE display_pd(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.8x", p->u32[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N 64-bit unsigned integers from p. + */ +void NOINLINE display_pq(const V512 *p, const char *banner, int n_elems) { + int i = 7; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.16llx", p->u64[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N single precision floats from p. + */ + +void NOINLINE display_psf(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %7g", p->f32[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N double precision floats from p. + */ + +void NOINLINE display_pdf(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %7g", p->f64[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Check that the low N 16-bit elements of "got" and "expected" are the same. + */ +int NOINLINE check_equal_nw(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u16[i] != v2->u16[i]) { + printf("ERROR(%d): %s failed at %d'th element: 0x%0.4x != 0x%0.4x\n", + line, banner ? banner : "", i, v1->u16[i], v2->u16[i]); + display_pw(got, "got:", n_elems); + display_pw(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +/* + * Check that the low N 32-bit elements of "got" and "expected" are the same. + */ +int NOINLINE check_equal_nd(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u32[i] != v2->u32[i]) { + printf("ERROR(%d): %s failed at %d'th element: 0x%0.8x != 0x%0.8x\n", + line, banner ? 
banner : "", i, v1->u32[i], v2->u32[i]); + display_pd(got, "got:", n_elems); + display_pd(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +/* + * Check that the low N 64-bit elements of "got" and "expected" are the same. + */ +int NOINLINE check_equal_nq(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u64[i] != v2->u64[i]) { + printf( + "ERROR(%d): %s failed at %d'th element: 0x%0.16llx != 0x%0.16llx\n", + line, banner ? banner : "", i, v1->u64[i], v2->u64[i]); + display_pq(got, "got:", n_elems); + display_pq(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +double delta = 1e-4; + +#define EQUAL_FP(v1, v2) \ + ((v1) < (v2) ? ((v2) - (v1) < delta) : ((v1) - (v2) < delta)) + +/* + * Check that the low N single precision float elements of "got" and "expected" + * are the same. + */ +int NOINLINE check_equal_nsf(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (!EQUAL_FP(v1->f32[i], v2->f32[i])) { + printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, + banner ? banner : "", i, v1->f32[i], v2->f32[i]); + display_psf(got, "got:", n_elems); + display_psf(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +/* + * Check that the low N double precision float elements of "got" and "expected" + * are the same. + */ +int NOINLINE check_equal_ndf(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (!EQUAL_FP(v1->f64[i], v2->f64[i])) { + printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, + banner ? banner : "", i, v1->f64[i], v2->f64[i]); + display_pdf(got, "got:", n_elems); + display_pdf(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +#endif /* M512_TEST_UTIL_H_INCLUDED */
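
A note on the expected-value loops in the tests above: they all encode the same AVX-512 per-element writemask rule. The scalar sketch below restates that rule on its own, purely as a reading aid; it is not part of the patch, and the helper name apply_mask_epi32 is made up here.

#include <stdint.h>

/*
 * Writemask rule mirrored by every masked check above: when bit i of k is
 * set, destination element i receives the computed result; when it is
 * clear, a _mask_ form keeps the pass-through source element and a _maskz_
 * form writes zero.
 */
static int32_t apply_mask_epi32(uint16_t k, int i, int32_t op_result,
                                int32_t src, int zero_masking) {
  if ((k >> i) & 1) {
    return op_result;
  }
  return zero_masking ? 0 : src;
}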
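
The comments reading "Memory-form instruction does not zero high half of result" are why the *_storeu_* checks in vpmovdown.c compare only 2 dwords while the register-form checks compare 4: the register form of, e.g., VPMOVQB defines the whole XMM destination (upper 8 bytes zeroed, hence the expected.u64[1] = 0 lines), whereas the store form writes only the converted bytes. A minimal sketch of the two forms, assuming only AVX-512F; the wrapper names are illustrative:

#include <immintrin.h>

/* Register form: returns a full __m128i; the instruction zeroes the upper
 * 8 bytes of the destination, so the test can compare 4 dwords. */
static __m128i qb_to_register(__m512i v) {
  return _mm512_cvtepi64_epi8(v);
}

/* Memory form: only the converted, mask-selected bytes are written; bytes
 * beyond them keep whatever the buffer held, so the test seeds the
 * destination first and compares just 2 dwords. */
static void qb_to_memory(void *dst, __mmask8 k, __m512i v) {
  _mm512_mask_cvtepi64_storeu_epi8(dst, k, v);
}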
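
As an illustration of how the m512_test_util.h harness above is meant to be used from a new single-source test, here is a sketch that models the signed-saturating down-convert once in a helper instead of repeating inline ternaries. It is not part of the patch: the file would need the same flags as the AVX512F tests (an AVX-512F-capable -march plus -fms-extensions for the harness), and the name saturate_s32_to_s8 is illustrative.

#include "m512_test_util.h"
#include <stdio.h>

/* Scalar model of VPMOVSDB: clamp a signed 32-bit value to [-128, 127]. */
static signed char saturate_s32_to_s8(int v) {
  if (v < -128) {
    return -128;
  }
  if (v > 127) {
    return 127;
  }
  return (signed char)v;
}

int main(void) {
  V512 src, res, expected;
  int i;

  for (i = 0; i < 16; i++) {
    src.s32[i] = (i & 1) ? 1000 * i : -1000 * i;
  }

  res.xmmi[0] = _mm512_cvtsepi32_epi8(src.zmmi);

  for (i = 0; i < 16; i++) {
    expected.s8[i] = saturate_s32_to_s8(src.s32[i]);
  }

  /* 16 result bytes == 4 dwords, matching how the tests above size checks. */
  check_equal_nd(&res, &expected, 4, "_mm512_cvtsepi32_epi8 sketch", __LINE__);

  printf(n_errs ? "FAILED\n" : "PASSED\n");
  return n_errs != 0;
}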