Index: SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt +++ SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt @@ -0,0 +1,4 @@ +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS -march=skylake-avx512) +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Vector-AVX512-") Index: SingleSource/UnitTests/Vector/AVX512/Makefile =================================================================== --- SingleSource/UnitTests/Vector/AVX512/Makefile +++ SingleSource/UnitTests/Vector/AVX512/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Vector/AVX512/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=skylake-avx512 +LDFLAGS += -lm + +include $(LEVEL)/SingleSource/Makefile.singlesrc + +TARGET_FLAGS += -march=skylake-avx512 +LCCFLAGS += -march=skylake-avx512 Index: SingleSource/UnitTests/Vector/AVX512/abs.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/abs.c +++ SingleSource/UnitTests/Vector/AVX512/abs.c @@ -0,0 +1,65 @@ +/* + * Test absolute value intrinsics. + * Here we check for _mm512_abs_ps and _mm512_abs_pd intrinsics. + */ + +#include "m512_test_util.h" +#include + +V512 f32; +V512 f64; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + f32.f32[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 8; i++) { + f64.f64[i] = (i & 1) ? -i : i; + } +} + +void NOINLINE do_abs_ps() { + V512 res; + V512 expected; + volatile int i; + + res.zmm = _mm512_abs_ps(f32.zmm); + + for (i = 0; i < 16; i++) { + expected.s32[i] = f32.s32[i] & 0x7fffffff; + } + + check_equal_nd(&res, &expected, 16, "_mm512_abs_ps", __LINE__); +} + +void NOINLINE do_abs_pd() { + V512 res; + V512 expected; + volatile int i; + + res.zmmd = _mm512_abs_pd(f64.zmmd); + + for (i = 0; i < 8; i++) { + expected.s64[i] = f64.s64[i] & 0x7fffffffffffffff; + } + + check_equal_nd(&res, &expected, 16, "_mm512_abs_pd", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_abs_ps(); + do_abs_pd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/abs.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/abs.reference_output +++ SingleSource/UnitTests/Vector/AVX512/abs.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/fma.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/fma.c +++ SingleSource/UnitTests/Vector/AVX512/fma.c @@ -0,0 +1,827 @@ +/* + * Test fma(fmadd, fmsub, fnmadd, fnmsub) instructions. + * Here we check for _mm512_[mask|mask3]_f[madd|msub|nmadd|nmsub]_[round] + * intrinsics. 
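+ *
+ * Each intrinsic result is checked against a scalar emulation loop; the
+ * Fma_order argument selects which operand ordering (132/213/231) the
+ * reference computation uses, and elements whose mask bit is clear are
+ * expected to keep the original value of the merge-destination operand.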
+ */ + +#include "m512_test_util.h" +#include +#include + +int verbose = 0; + +__m512i i1; +__m512i i2; +__m512i i3; +__m512i i4; +__m512i i5; + +__m512 f1; +__m512 f2; +__m512 f3; +__m512 f4; +__m512 f5; + +__m512d d1; +__m512d d2; +__m512d d3; +__m512d d4; +__m512d d5; + +typedef enum { FMA_233, FMA_132, FMA_231, FMA_213, FMA_23c1 } Fma_order; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE init() { + int i; + V512 *pi1 = (V512 *)&i1; + V512 *pi2 = (V512 *)&i2; + V512 *pi3 = (V512 *)&i3; + V512 *pf1 = (V512 *)&f1; + V512 *pf2 = (V512 *)&f2; + V512 *pf3 = (V512 *)&f3; + V512 *pd1 = (V512 *)&d1; + V512 *pd2 = (V512 *)&d2; + V512 *pd3 = (V512 *)&d3; + + for (i = 0; i < 16; i++) { + pi1->s32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol; + pf1->f32[i] = pi1->s32[i]; + + pi2->s32[i] = 100 + ((i & 3) == 3 ? 1 : -1) * i + vol; + pf2->f32[i] = -pi2->s32[i]; + + pi3->s32[i] = 400 + ((i & 1) ? -1 : 1) * i + vol; + pf3->f32[i] = pi3->s32[i]; + } + + for (i = 0; i < 8; i++) { + pd1->f64[i] = pi1->s32[i]; + pd2->f64[i] = pi2->s32[i]; + pd3->f64[i] = -pi3->s32[i]; + } +} + +void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 16; i++) { + int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i]; + if (got->s32[i] != ans) { + printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n", + banner ? banner : "", got->s32[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 8; i++) { + __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i]; + if (got->s64[i] != ans) { + printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64 + " at element [%d]\n", + banner ? 
banner : "", got->s64[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE emulate_fmadd_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f32[i] = + v2->f32[i] * v3->f32[(i & ~0x3) + 1] + v3->f32[(i & ~0x3)]; + break; + + case FMA_132: + result->f32[i] = v1->f32[i] * v3->f32[i] + v2->f32[i]; + break; + + case FMA_231: + result->f32[i] = v2->f32[i] * v3->f32[i] + v1->f32[i]; + break; + + case FMA_213: + result->f32[i] = v2->f32[i] * v1->f32[i] + v3->f32[i]; + break; + + case FMA_23c1: + result->f32[i] = (v2->f32[i] * v3->f32[i]) + 1.0f; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmsub_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f32[i] = + v2->f32[i] * v3->f32[(i & ~0x3) + 1] - v3->f32[(i & ~0x3)]; + break; + + case FMA_132: + result->f32[i] = v1->f32[i] * v3->f32[i] - v2->f32[i]; + break; + + case FMA_231: + result->f32[i] = v2->f32[i] * v3->f32[i] - v1->f32[i]; + break; + + case FMA_213: + result->f32[i] = v2->f32[i] * v1->f32[i] - v3->f32[i]; + break; + + case FMA_23c1: + result->f32[i] = (v2->f32[i] * v3->f32[i]) - 1.0f; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fnmadd_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f32[i] = + -(v2->f32[i] * v3->f32[(i & ~0x3) + 1]) + v3->f32[(i & ~0x3)]; + break; + + case FMA_132: + result->f32[i] = -(v1->f32[i] * v3->f32[i]) + v2->f32[i]; + break; + + case FMA_231: + result->f32[i] = -(v2->f32[i] * v3->f32[i]) + v1->f32[i]; + break; + + case FMA_213: + result->f32[i] = -(v2->f32[i] * v1->f32[i]) + v3->f32[i]; + break; + + case FMA_23c1: + result->f32[i] = -(v2->f32[i] * v3->f32[i]) + 1.0f; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fnmsub_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f32[i] = + -(v2->f32[i] * v3->f32[(i & ~0x3) + 1]) - v3->f32[(i & ~0x3)]; + break; + + case FMA_132: + result->f32[i] = -(v1->f32[i] * v3->f32[i]) - v2->f32[i]; + break; + + case FMA_231: + result->f32[i] = -(v2->f32[i] * v3->f32[i]) - v1->f32[i]; + break; + + case FMA_213: + result->f32[i] = -(v2->f32[i] * 
v1->f32[i]) - v3->f32[i]; + break; + + case FMA_23c1: + result->f32[i] = -(v2->f32[i] * v3->f32[i]) - 1.0f; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmadd_pi(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_233: + result->s32[i] = + v2->s32[i] * v3->s32[(i & ~0x3) + 1] + v3->s32[(i & ~0x3)]; + break; + + case FMA_132: + result->s32[i] = v1->s32[i] * v3->s32[i] + v2->s32[i]; + break; + + case FMA_231: + result->s32[i] = v2->s32[i] * v3->s32[i] + v1->s32[i]; + break; + + case FMA_213: + result->s32[i] = v2->s32[i] * v1->s32[i] + v3->s32[i]; + break; + + case FMA_23c1: + result->s32[i] = v2->s32[i] * v3->s32[i] + 1; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmadd_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f64[i] = + v2->f64[i] * v3->f64[(i & ~0x3) + 1] + v3->f64[(i & ~0x3)]; + break; + + case FMA_132: + result->f64[i] = v1->f64[i] * v3->f64[i] + v2->f64[i]; + break; + + case FMA_231: + result->f64[i] = v2->f64[i] * v3->f64[i] + v1->f64[i]; + break; + + case FMA_213: + result->f64[i] = v2->f64[i] * v1->f64[i] + v3->f64[i]; + break; + + case FMA_23c1: + result->f64[i] = v2->f64[i] * v3->f64[i] + 1.0; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmsub_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f64[i] = + v2->f64[i] * v3->f64[(i & ~0x3) + 1] - v3->f64[(i & ~0x3)]; + break; + + case FMA_132: + result->f64[i] = v1->f64[i] * v3->f64[i] - v2->f64[i]; + break; + + case FMA_231: + result->f64[i] = v2->f64[i] * v3->f64[i] - v1->f64[i]; + break; + + case FMA_213: + result->f64[i] = v2->f64[i] * v1->f64[i] - v3->f64[i]; + break; + + case FMA_23c1: + result->f64[i] = v2->f64[i] * v3->f64[i] - 1.0; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fnmadd_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f64[i] = + -(v2->f64[i] * v3->f64[(i & ~0x3) + 1]) + v3->f64[(i & ~0x3)]; + break; + + case FMA_132: + result->f64[i] = -(v1->f64[i] * v3->f64[i]) + v2->f64[i]; + break; + + case FMA_231: 
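+      /* fnmadd, 231 ordering: -(src2 * src3) + src1 */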
+ result->f64[i] = -(v2->f64[i] * v3->f64[i]) + v1->f64[i]; + break; + + case FMA_213: + result->f64[i] = -(v2->f64[i] * v1->f64[i]) + v3->f64[i]; + break; + + case FMA_23c1: + result->f64[i] = -(v2->f64[i] * v3->f64[i]) + 1.0; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fnmsub_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_233: + result->f64[i] = + -(v2->f64[i] * v3->f64[(i & ~0x3) + 1]) - v3->f64[(i & ~0x3)]; + break; + + case FMA_132: + result->f64[i] = -(v1->f64[i] * v3->f64[i]) - v2->f64[i]; + break; + + case FMA_231: + result->f64[i] = -(v2->f64[i] * v3->f64[i]) - v1->f64[i]; + break; + + case FMA_213: + result->f64[i] = -(v2->f64[i] * v1->f64[i]) - v3->f64[i]; + break; + + case FMA_23c1: + result->f64[i] = -(v2->f64[i] * v3->f64[i]) - 1.0; + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE do_fmadd_ps() { + f4 = _mm512_fmadd_ps(f1, f2, f3); + emulate_fmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmadd_ps"); + + f4 = _mm512_mask_fmadd_ps(f1, 0x79fa, f2, f3); + emulate_fmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmadd_ps"); + + f4 = _mm512_mask3_fmadd_ps(f1, f2, f3, 0x563a); + emulate_fmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmadd_ps"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + f4 = _mm512_fmadd_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmadd_round_ps"); + + f4 = _mm512_mask_fmadd_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmadd_round_ps"); + + f4 = _mm512_mask3_fmadd_round_ps(f1, f2, f3, 0x563a, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmadd_round_ps"); +} + +void NOINLINE do_fnmsub_ps() { + f4 = _mm512_fnmsub_ps(f1, f2, f3); + emulate_fnmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmsub_ps"); + + f4 = _mm512_mask_fnmsub_ps(f1, 0x79fa, f2, f3); + emulate_fnmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmsub_ps"); + + f4 = _mm512_mask3_fnmsub_ps(f1, f2, f3, 0x563a); + emulate_fnmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmsub_ps"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. 
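+   * (Every product and sum formed here is a small integer that is exactly
+   * representable, so all rounding modes produce identical results.)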
+ */ + + f4 = _mm512_fnmsub_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fnmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmsub_round_ps"); + + f4 = _mm512_mask_fnmsub_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fnmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmsub_round_ps"); + + f4 = _mm512_mask3_fnmsub_round_ps(f1, f2, f3, 0x563a, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fnmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmsub_round_ps"); +} + +void NOINLINE do_fmadd_pd() { + d4 = _mm512_fmadd_pd(d1, d2, d3); + emulate_fmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmadd_pd"); + + d4 = _mm512_mask_fmadd_pd(d1, 0xfa, d2, d3); + emulate_fmadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmadd_pd"); + + d4 = _mm512_mask3_fmadd_pd(d1, d2, d3, 0x56); + emulate_fmadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmadd_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + d4 = _mm512_fmadd_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmadd_round_pd"); + + d4 = _mm512_mask_fmadd_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmadd_round_pd"); + + d4 = _mm512_mask3_fmadd_round_pd(d1, d2, d3, 0x63, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmadd_round_pd"); +} + +void NOINLINE do_fnmsub_pd() { + d4 = _mm512_fnmsub_pd(d1, d2, d3); + emulate_fnmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmsub_pd"); + + d4 = _mm512_mask_fnmsub_pd(d1, 0xfa, d2, d3); + emulate_fnmsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fnmsub_pd"); + + d4 = _mm512_mask3_fnmsub_pd(d1, d2, d3, 0x56); + emulate_fnmsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fnmsub_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. 
+ */ + + d4 = _mm512_fnmsub_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fnmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmsub_round_pd"); + + d4 = _mm512_mask_fnmsub_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fnmsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fnmsub_round_pd"); + + d4 = _mm512_mask3_fnmsub_round_pd(d1, d2, d3, 0x63, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fnmsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fnmsub_round_pd"); +} + +void NOINLINE do_fmsub_ps() { + f4 = _mm512_fmsub_ps(f1, f2, f3); + emulate_fmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsub_ps"); + + f4 = _mm512_mask_fmsub_ps(f1, 0x79fa, f2, f3); + emulate_fmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsub_ps"); + + f4 = _mm512_mask3_fmsub_ps(f1, f2, f3, 0x563a); + emulate_fmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsub_ps"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + f4 = _mm512_fmsub_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsub_round_ps"); + + f4 = _mm512_mask_fmsub_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsub_round_ps"); + + f4 = _mm512_mask3_fmsub_round_ps(f1, f2, f3, 0x563a, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsub_round_ps"); +} + +void NOINLINE do_fnmadd_ps() { + f4 = _mm512_fnmadd_ps(f1, f2, f3); + emulate_fnmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmadd_ps"); + + f4 = _mm512_mask_fnmadd_ps(f1, 0x79fa, f2, f3); + emulate_fnmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmadd_ps"); + + f4 = _mm512_mask3_fnmadd_ps(f1, f2, f3, 0x563a); + emulate_fnmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmadd_ps"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. 
+ */ + + f4 = _mm512_fnmadd_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fnmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmadd_round_ps"); + + f4 = _mm512_mask_fnmadd_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fnmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmadd_round_ps"); + + f4 = _mm512_mask3_fnmadd_round_ps(f1, f2, f3, 0x563a, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fnmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmadd_round_ps"); +} + +void NOINLINE do_fmsub_pd() { + d4 = _mm512_fmsub_pd(d1, d2, d3); + emulate_fmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsub_pd"); + + d4 = _mm512_mask_fmsub_pd(d1, 0xfa, d2, d3); + emulate_fmsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmsub_pd"); + + d4 = _mm512_mask3_fmsub_pd(d1, d2, d3, 0x56); + emulate_fmsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmsub_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + d4 = _mm512_fmsub_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsub_round_pd"); + + d4 = _mm512_mask_fmsub_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmsub_round_pd"); + + d4 = _mm512_mask3_fmsub_round_pd(d1, d2, d3, 0x63, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmsub_round_pd"); +} + +void NOINLINE do_fnmadd_pd() { + d4 = _mm512_fnmadd_pd(d1, d2, d3); + emulate_fnmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmadd_pd"); + + d4 = _mm512_mask_fnmadd_pd(d1, 0xfa, d2, d3); + emulate_fnmadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fnmadd_pd"); + + d4 = _mm512_mask3_fnmadd_pd(d1, d2, d3, 0x56); + emulate_fnmadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fnmadd_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. 
+ */ + + d4 = _mm512_fnmadd_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fnmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmadd_round_pd"); + + d4 = _mm512_mask_fnmadd_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fnmadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fnmadd_round_pd"); + + d4 = _mm512_mask3_fnmadd_round_pd(d1, d2, d3, 0x63, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fnmadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fnmadd_round_pd"); +} + +int main(int argc, char *argv[]) { + if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' && + argv[1][2] == '\0') { + verbose = 1; + } + + init(); + + do_fmadd_ps(); + + do_fmadd_pd(); + + do_fnmsub_ps(); + + do_fnmsub_pd(); + + do_fmsub_ps(); + + do_fmsub_pd(); + + do_fnmadd_ps(); + + do_fnmadd_pd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/fma.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/fma.reference_output +++ SingleSource/UnitTests/Vector/AVX512/fma.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/fma_addsub.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/fma_addsub.c +++ SingleSource/UnitTests/Vector/AVX512/fma_addsub.c @@ -0,0 +1,411 @@ +/* + * Test addsub and subadd instructions. + * Here we check for _mm512_[mask|mask3]_[fmaddsub|fmsubadd]_[round] + * intrinsics. + */ +#include "m512_test_util.h" +#include +#include + +int verbose = 0; + +__m512 f1; +__m512 f2; +__m512 f3; +__m512 f4; +__m512 f5; + +__m512d d1; +__m512d d2; +__m512d d3; +__m512d d4; +__m512d d5; + +typedef enum { + FMA_132, + FMA_231, + FMA_213, +} Fma_order; + +volatile int vol = 0; /* To prevent optimizations */ + +void NOINLINE init() { + int i; + V512 *pf1 = (V512 *)&f1; + V512 *pf2 = (V512 *)&f2; + V512 *pf3 = (V512 *)&f3; + V512 *pd1 = (V512 *)&d1; + V512 *pd2 = (V512 *)&d2; + V512 *pd3 = (V512 *)&d3; + + for (i = 0; i < 16; i++) { + pf1->f32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol; + pf2->f32[i] = -(100 + ((i & 3) == 3 ? 1 : -1) * i + vol); + pf3->f32[i] = 400 + ((i & 1) ? -1 : 1) * i + vol; + } + + for (i = 0; i < 8; i++) { + pd1->f64[i] = pf1->f32[i]; + pd2->f64[i] = -pf2->f32[i]; + pd3->f64[i] = -pf3->f32[i]; + } +} + +void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 16; i++) { + int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i]; + if (got->s32[i] != ans) { + printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n", + banner ? banner : "", got->s32[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig, + int mask, char *banner) { + int i; + V512 *got = (V512 *)vgot; + V512 *expected = (V512 *)vexpected; + V512 *orig = (V512 *)vexpected_orig; + + for (i = 0; i < 8; i++) { + __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i]; + if (got->s64[i] != ans) { + printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64 + " at element [%d]\n", + banner ? 
banner : "", got->s64[i], ans, i); + n_errs++; + break; + } + } +} + +void NOINLINE emulate_fmaddsub_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_132: + result->f32[i] = + v1->f32[i] * v3->f32[i] + ((i % 2 == 0) ? -v2->f32[i] : v2->f32[i]); + break; + + case FMA_231: + result->f32[i] = + v2->f32[i] * v3->f32[i] + ((i % 2 == 0) ? -v1->f32[i] : v1->f32[i]); + break; + + case FMA_213: + result->f32[i] = + v2->f32[i] * v1->f32[i] + ((i % 2 == 0) ? -v3->f32[i] : v3->f32[i]); + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmsubadd_ps(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 16; i++) { + + if (((1 << i) & mask) == 0) { + result->u32[i] = v1->u32[i]; + continue; + } + + switch (order) { + case FMA_132: + result->f32[i] = + v1->f32[i] * v3->f32[i] + ((i % 2 == 0) ? v2->f32[i] : -v2->f32[i]); + break; + + case FMA_231: + result->f32[i] = + v2->f32[i] * v3->f32[i] + ((i % 2 == 0) ? v1->f32[i] : -v1->f32[i]); + break; + + case FMA_213: + result->f32[i] = + v2->f32[i] * v1->f32[i] + ((i % 2 == 0) ? v3->f32[i] : -v3->f32[i]); + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmaddsub_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_132: + result->f64[i] = + v1->f64[i] * v3->f64[i] + ((i % 2 == 0) ? -v2->f64[i] : v2->f64[i]); + break; + + case FMA_231: + result->f64[i] = + v2->f64[i] * v3->f64[i] + ((i % 2 == 0) ? -v1->f64[i] : v1->f64[i]); + break; + + case FMA_213: + result->f64[i] = + v2->f64[i] * v1->f64[i] + ((i % 2 == 0) ? -v3->f64[i] : v3->f64[i]); + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE emulate_fmsubadd_pd(void *presult, const void *p1, int mask, + const void *p2, const void *p3, + Fma_order order) { + int i; + V512 *result = (V512 *)presult; + V512 *v1 = (V512 *)p1; + V512 *v2 = (V512 *)p2; + V512 *v3 = (V512 *)p3; + + for (i = 0; i < 8; i++) { + + if (((1 << i) & mask) == 0) { + result->u64[i] = v1->u64[i]; + continue; + } + + switch (order) { + case FMA_132: + result->f64[i] = + v1->f64[i] * v3->f64[i] + ((i % 2 == 0) ? v2->f64[i] : -v2->f64[i]); + break; + + case FMA_231: + result->f64[i] = + v2->f64[i] * v3->f64[i] + ((i % 2 == 0) ? v1->f64[i] : -v1->f64[i]); + break; + + case FMA_213: + result->f64[i] = + v2->f64[i] * v1->f64[i] + ((i % 2 == 0) ? 
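+          /* fmsubadd: even-index lanes add the addend, odd-index lanes subtract it */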
v3->f64[i] : -v3->f64[i]); + break; + + default: + printf("ERROR -- bad fma order %d\n", (int)order); + n_errs++; + return; + } + } +} + +void NOINLINE do_fmaddsub_ps() { + f4 = _mm512_fmaddsub_ps(f1, f2, f3); + emulate_fmaddsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmaddsub_ps"); + + f4 = _mm512_mask_fmaddsub_ps(f1, 0x79fa, f2, f3); + emulate_fmaddsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmaddsub_ps"); + + f4 = _mm512_mask3_fmaddsub_ps(f1, f2, f3, 0x563a); + emulate_fmaddsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmaddsub_ps"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + f4 = _mm512_fmaddsub_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmaddsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmaddsub_round_ps"); + + f4 = _mm512_mask_fmaddsub_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmaddsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmaddsub_round_ps"); + + f4 = _mm512_mask3_fmaddsub_round_ps( + f1, f2, f3, 0x563a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmaddsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmaddsub_round_ps"); +} + +void NOINLINE do_fmaddsub_pd() { + d4 = _mm512_fmaddsub_pd(d1, d2, d3); + emulate_fmaddsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmaddsub_pd"); + + d4 = _mm512_mask_fmaddsub_pd(d1, 0xfa, d2, d3); + emulate_fmaddsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmaddsub_pd"); + + d4 = _mm512_mask3_fmaddsub_pd(d1, d2, d3, 0x56); + emulate_fmaddsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmaddsub_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + d4 = _mm512_fmaddsub_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmaddsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmaddsub_round_pd"); + + d4 = _mm512_mask_fmaddsub_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmaddsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmaddsub_round_pd"); + + d4 = _mm512_mask3_fmaddsub_round_pd( + d1, d2, d3, 0x63, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmaddsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmaddsub_round_pd"); +} + +void NOINLINE do_fmsubadd_ps() { + f4 = _mm512_fmsubadd_ps(f1, f2, f3); + emulate_fmsubadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsubadd_ps"); + + f4 = _mm512_mask_fmsubadd_ps(f1, 0x79fa, f2, f3); + emulate_fmsubadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsubadd_ps"); + + f4 = _mm512_mask3_fmsubadd_ps(f1, f2, f3, 0x563a); + emulate_fmsubadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsubadd_ps"); + + /* + * Employ rounding modes. 
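+   * (The _round_ variants take the rounding mode as an explicit immediate,
+   * here combined with _MM_FROUND_NO_EXC, instead of using the MXCSR state.)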
+ * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + f4 = _mm512_fmsubadd_round_ps(f1, f2, f3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmsubadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsubadd_round_ps"); + + f4 = _mm512_mask_fmsubadd_round_ps(f1, 0x79fa, f2, f3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmsubadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132); + check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsubadd_round_ps"); + + f4 = _mm512_mask3_fmsubadd_round_ps( + f1, f2, f3, 0x563a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmsubadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231); + check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsubadd_round_ps"); +} + +void NOINLINE do_fmsubadd_pd() { + d4 = _mm512_fmsubadd_pd(d1, d2, d3); + emulate_fmsubadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsubadd_pd"); + + d4 = _mm512_mask_fmsubadd_pd(d1, 0xfa, d2, d3); + emulate_fmsubadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmsubadd_pd"); + + d4 = _mm512_mask3_fmsubadd_pd(d1, d2, d3, 0x56); + emulate_fmsubadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmsubadd_pd"); + + /* + * Employ rounding modes. + * Our FP inputs are all integer values, so there's no need for any + * special emulation routine. + */ + + d4 = _mm512_fmsubadd_round_pd(d1, d2, d3, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + emulate_fmsubadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsubadd_round_pd"); + + d4 = _mm512_mask_fmsubadd_round_pd(d1, 0x79, d2, d3, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + emulate_fmsubadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132); + check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmsubadd_round_pd"); + + d4 = _mm512_mask3_fmsubadd_round_pd( + d1, d2, d3, 0x63, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + emulate_fmsubadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231); + check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmsubadd_round_pd"); +} + +int main(int argc, char *argv[]) { + if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' && + argv[1][2] == '\0') { + verbose = 1; + } + + init(); + + do_fmaddsub_ps(); + do_fmaddsub_pd(); + + do_fmsubadd_ps(); + do_fmsubadd_pd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output +++ SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/imul.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/imul.c +++ SingleSource/UnitTests/Vector/AVX512/imul.c @@ -0,0 +1,103 @@ +/* + * Test various integer multiply intrinsics. + * Here we check for _mm512_[mask]mul_ep[i|u]32 intrinsics. + */ + +#include "m512_test_util.h" +#include +#include + +volatile int vol0 = 0; + +V512 i64; +V512 i64_mix; +V512 i64_big; + +void NOINLINE init() { + volatile int i; + + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? 
i : -i; + i64_big.s64[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i64_big.s64[i] = -i64_big.s64[i]; + } + } +} + +void NOINLINE do_muldq() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmmi = _mm512_mul_epi32(i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + expected.s64[i] = (I64)i64_mix.s32[2 * i] * i64_big.s32[2 * i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mul_epi32", __LINE__); + + /* + * No-op to inhibit PRE of i64_big, thus enabling localized ciscization. + */ + i64_big.xmm[vol0] = i64_big.xmm[vol0]; + + k = 0xcd; + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_mul_epi32(res.zmmi, k, i64.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + expected.s64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s64[i] = (I64)i64.s32[2 * i] * i64_big.s32[2 * i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_mul_epi32", __LINE__); +} + +void NOINLINE do_muludq() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmmi = _mm512_mul_epu32(i64_mix.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + expected.u64[i] = (U64)i64_mix.u32[2 * i] * i64_big.u32[2 * i]; + } + check_equal_nd(&res, &expected, 16, "_mm512_mul_epu32", __LINE__); + + /* + * No-op to inhibit PRE of i64_big, thus enabling localized ciscization. + */ + i64_big.xmm[vol0] = i64_big.xmm[vol0]; + + k = 0xcd; + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_mul_epu32(res.zmmi, k, i64.zmmi, i64_big.zmmi); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (U64)i64.u32[2 * i] * i64_big.u32[2 * i]; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_mul_epu32", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_muldq(); + do_muludq(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/imul.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/imul.reference_output +++ SingleSource/UnitTests/Vector/AVX512/imul.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c +++ SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c @@ -0,0 +1,240 @@ +#include "m512_test_util.h" +#include +#include +#include + +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_add_pd() + * _mm512_max_pd() + * _mm512_min_pd() + * _mm512_mask_max_pd() + * _mm512_mask_min_pd() + * _mm512_mask_mul_pd() + * _mm512_mask_abs_pd() + * _mm512_add_round_pd() + * _mm512_sub_round_pd() + */ + +int show_op = 0; + +typedef enum { ASSIGN, ABS, ADD, MAX, MIN, MUL, SUB } OPER; + +static void NOINLINE intop(OPER op, double ivalout[8], double ivalop1[8], + double ivalop2[8]) { + int i; + int handled = 0; + + memset(ivalout, 0, sizeof(ivalout)); + for (i = 0; i < 8; i += 1) { + switch (op) { + case ASSIGN: + handled = 1; + ivalout[i] = ivalop1[i]; + break; + case ABS: + handled = 1; + ivalout[i] = ivalop1[i] >= 0 ? ivalop1[i] : -ivalop1[i]; + break; + case ADD: + handled = 1; + ivalout[i] = ivalop1[i] + ivalop2[i]; + break; + case MAX: + handled = 1; + ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i]; + break; + case MIN: + handled = 1; + ivalout[i] = (ivalop1[i] < ivalop2[i]) ? 
ivalop1[i] : ivalop2[i]; + break; + case MUL: + handled = 1; + ivalout[i] = ivalop2[i] * ivalop1[i]; + break; + case SUB: + handled = 1; + ivalout[i] = ivalop1[i] - ivalop2[i]; + break; + default: + printf("FAIL: bad op\n"); + break; + } + } + if (!handled) { + printf("FAIL: unsupported op\n"); + n_errs++; + } +} + +static int NOINLINE check(double val1[], double good[]) { + int i; + int res = 1; + for (i = 0; i < 8; i += 1) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + return (res); +} + +static int NOINLINE check_mask(double val1[], double good[], int mask) { + int i; + int res = 1; + for (i = 0; i < 8; i += 1) { + if ((1 << i) & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + } + return (res); +} + +static void NOINLINE print_vec(char *pfx, double ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]); + printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]); +} + +#define DOONE(OP, FUNC) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.zmmd = FUNC(v1.zmmd, v2.zmmd); \ + passed = check(vvv.f64, good.f64); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_MASK(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd, v2.zmmd); \ + passed = check_mask(vvv.f64, good.f64, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_MASK_1OP(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd); \ + passed = check_mask(vvv.f64, good.f64, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_ROUND(OP, FUNC, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.zmmd = FUNC(v1.zmmd, v2.zmmd, ROUND); \ + passed = check(vvv.f64, good.f64); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd, v2.zmmd, ROUND); \ + passed = check_mask(vvv.f64, good.f64, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +int main() { + double init1[] = {1, 2, -3, 4, 5, -6, 7, 8}; + double init2[] = {11, 12, 23, 24, 35, 36, 17, 38}; + + V512 v1; + V512 v2; + V512 good; + V512 vvv; + + intop(ASSIGN, v1.f64, init1, 
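+        /* ASSIGN reads only its first source operand, so a null second operand is safe */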
0); + intop(ASSIGN, v2.f64, init2, 0); + + // simple intrinsics + DOONE(ADD, _mm512_add_pd); + DOONE(MAX, _mm512_max_pd); + DOONE(MIN, _mm512_min_pd); + DOONE(MUL, _mm512_mul_pd); + DOONE(SUB, _mm512_sub_pd); + + DOONE_WITH_MASK(ADD, _mm512_mask_add_pd, 0x07); + DOONE_WITH_MASK(MAX, _mm512_mask_max_pd, 0x01); + DOONE_WITH_MASK(MIN, _mm512_mask_min_pd, 0x03); + DOONE_WITH_MASK(MUL, _mm512_mask_mul_pd, 0xf0); + DOONE_WITH_MASK(SUB, _mm512_mask_sub_pd, 0x9f); + + DOONE_WITH_MASK_1OP(ABS, _mm512_mask_abs_pd, 0xf4); + + // intrinsics with rounding mode + DOONE_ROUND(ADD, _mm512_add_round_pd, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + DOONE_ROUND(SUB, _mm512_sub_round_pd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + DOONE_WITH_MASK_ROUND(ADD, _mm512_mask_add_round_pd, 0x07, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(SUB, _mm512_mask_sub_round_pd, 0xf0, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output +++ SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c +++ SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c @@ -0,0 +1,236 @@ +#include "m512_test_util.h" +#include +#include +#include + +/* + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_add_ps() + * _mm512_max_ps() + * _mm512_min_ps() + * _mm512_mask_max_ps() + * _mm512_mask_min_ps() + * _mm512_mask_mul_ps() + * _mm512_mask_abs_ps() + * _mm512_add_round_ps() + * _mm512_sub_round_ps() + */ + + +int show_op = 0; + +typedef enum { ASSIGN, ABS, ADD, MAX, MIN, MUL, SUB } OPER; + +static void NOINLINE intop(OPER op, float ivalout[16], float ivalop1[16], + float ivalop2[16]) { + int i; + int handled = 0; + + memset(ivalout, 0, sizeof(ivalout)); + for (i = 0; i < 16; i += 1) { + switch (op) { + case ASSIGN: + handled = 1; + ivalout[i] = ivalop1[i]; + break; + case ADD: + handled = 1; + ivalout[i] = ivalop1[i] + ivalop2[i]; + break; + case ABS: + handled = 1; + ivalout[i] = ivalop1[i] >= 0 ? ivalop1[i] : -ivalop1[i]; + break; + case MAX: + handled = 1; + ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i]; + break; + case MIN: + handled = 1; + ivalout[i] = (ivalop1[i] < ivalop2[i]) ? 
ivalop1[i] : ivalop2[i]; + break; + case MUL: + handled = 1; + ivalout[i] = ivalop2[i] * ivalop1[i]; + break; + case SUB: + handled = 1; + ivalout[i] = ivalop1[i] - ivalop2[i]; + break; + default: + printf("FAIL: bad op\n"); + break; + } + } + if (!handled) { + printf("FAIL: unsupported op\n"); + n_errs++; + } +} + +static int NOINLINE check(float val1[], float good[]) { + int i; + int res = 1; + for (i = 0; i < 16; i += 1) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + return (res); +} + +static int NOINLINE check_mask(float val1[], float good[], int mask) { + int i; + int res = 1; + for (i = 0; i < 16; i += 1) { + if ((1 << i) & mask) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + } + return (res); +} + +static void NOINLINE print_vec(char *pfx, float ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[15], ivec[14], ivec[13], + ivec[12]); + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[11], ivec[10], ivec[9], ivec[8]); + printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]); + printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]); +} + +#define DOONE(OP, FUNC) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.zmm = FUNC(v1.zmm, v2.zmm); \ + passed = check(vvv.f32, good.f32); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_MASK(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm, v2.zmm); \ + passed = check_mask(vvv.f32, good.f32, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_MASK_1OP(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm); \ + passed = check_mask(vvv.f32, good.f32, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_ROUND(OP, FUNC, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.zmm = FUNC(v1.zmm, v2.zmm, ROUND); \ + passed = check(vvv.f32, good.f32); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm, v2.zmm, ROUND); \ + passed = check_mask(vvv.f32, good.f32, MMASK); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +int main() { + float init1[] = {1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, -11, 12, 13, 14, 15, 16}; + float init2[] = {11, 12, 23, 24, -35, 36, 17, 38, + 42, -1, 33, 7, 8, 10, 11, 12}; + + V512 v1; + V512 v2; + V512 good; + V512 vvv; + + intop(ASSIGN, v1.f32, init1, 0); + intop(ASSIGN, v2.f32, init2, 0); + + // simple intrinsics + DOONE(ADD, _mm512_add_ps); + DOONE(MAX, _mm512_max_ps); + DOONE(MIN, _mm512_min_ps); + + DOONE_WITH_MASK(MAX, _mm512_mask_max_ps, 0xf01); + DOONE_WITH_MASK(MIN, _mm512_mask_min_ps, 0xf03); + DOONE_WITH_MASK(MUL, _mm512_mask_mul_ps, 0xff0); + + DOONE_WITH_MASK_1OP(ABS, _mm512_mask_abs_ps, 0xcf1); + + // intrinsics with rounding mode round + DOONE_ROUND(ADD, _mm512_add_round_ps, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(SUB, _mm512_sub_round_ps, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output +++ SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/m512_test_util.h =================================================================== --- SingleSource/UnitTests/Vector/AVX512/m512_test_util.h +++ SingleSource/UnitTests/Vector/AVX512/m512_test_util.h @@ -0,0 +1,258 @@ +#ifndef M512_TEST_UTIL_H_INCLUDED +#define M512_TEST_UTIL_H_INCLUDED + +/* + * Common declarations useful for writing 512-bit unit tests. + */ + +#include +#include +#include +#include + +#define ALIGNTO(n) __declspec(align(n)) + +/* + * For purposes of unit tests it can be beneficial to suppress inlining + * simply so that only a single instance of a test function is emitted. + * Makes it easier to diff A/B assembly output. + */ +#define NOINLINE __declspec(noinline) + +/* + * FULL_IREG(ax) expands to either eax or rax depending on the target. + */ +#if defined(__x86_64) || defined(_M_X64) +#define FULL_IREG(reg) r##reg +#else +#define FULL_IREG(reg) e##reg +#endif + +/* Number of elements in an array. */ +#define ASIZE(a) (sizeof((a)) / sizeof((a)[0])) + +typedef __int64 I64; +typedef unsigned __int64 U64; + +typedef union ALIGNTO(64) { + + __m512 zmm; + __m512d zmmd; + __m512i zmmi; + + __m256 ymm[2]; + __m256d ymmd[2]; + __m256i ymmi[2]; + + __m128 xmm[4]; + __m128d xmmd[4]; + __m128i xmmi[4]; + + char c[64]; + signed char s8[64]; + unsigned char u8[64]; + short s16[32]; + unsigned short u16[32]; + int s32[16]; + unsigned int u32[16]; + float f32[16]; + I64 s64[8]; + U64 u64[8]; + double f64[8]; + +} V512; + +int n_errs = 0; + +/* + * Print the low N 32-bit unsigned integers from p. + */ + +void NOINLINE display_pd(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.8x", p->u32[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N 64-bit unsigned integers from p. 
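+ * Elements are printed from the highest requested index down to element 0.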
+ */ +void NOINLINE display_pq(const V512 *p, const char *banner, int n_elems) { + int i = 7; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %0.16llx", p->u64[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N single precision floats from p. + */ + +void NOINLINE display_psf(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %7g", p->f32[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Print the low N double precision floats from p. + */ + +void NOINLINE display_pdf(const V512 *p, const char *banner, int n_elems) { + int i = 15; + + if (banner) { + printf("%s", banner); + } + + for (i = n_elems; i >= 0; i--) { + printf(" %7g", p->f64[i]); + if (i > 0 && i % 4 == 0) { + printf("\n"); + if (banner) { + printf("%*s", (int)strlen((void *)banner), ""); + } + } + } + printf("\n"); +} + +/* + * Check that the low N 32-bit elements of "got" and "expected" are the same. + */ +int NOINLINE check_equal_nd(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u32[i] != v2->u32[i]) { + printf("ERROR(%d): %s failed at %d'th element: 0x%0.8x != 0x%0.8x\n", + line, banner ? banner : "", i, v1->u32[i], v2->u32[i]); + display_pd(got, "got:", n_elems); + display_pd(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +/* + * Check that the low N 64-bit elements of "got" and "expected" are the same. + */ +int NOINLINE check_equal_nq(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (v1->u64[i] != v2->u64[i]) { + printf( + "ERROR(%d): %s failed at %d'th element: 0x%0.16llx != 0x%0.16llx\n", + line, banner ? banner : "", i, v1->u64[i], v2->u64[i]); + display_pq(got, "got:", n_elems); + display_pq(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +double delta = 1e-4; + +#define EQUAL_FP(v1, v2) \ + ((v1) < (v2) ? ((v2) - (v1) < delta) : ((v1) - (v2) < delta)) + +/* + * Check that the low N single precision float elements of "got" and "expected" + * are the same. + */ +int NOINLINE check_equal_nsf(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (!EQUAL_FP(v1->f32[i], v2->f32[i])) { + printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, + banner ? banner : "", i, v1->f32[i], v2->f32[i]); + display_psf(got, "got:", n_elems); + display_psf(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +/* + * Check that the low N double precision float elements of "got" and "expected" + * are the same. 
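+ * The comparison is approximate, using the absolute tolerance "delta" above.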
+ */ +int NOINLINE check_equal_ndf(void *got, void *expected, int n_elems, + char *banner, int line) { + int i, fail = 0; + V512 *v1 = (V512 *)got; + V512 *v2 = (V512 *)expected; + + for (i = 0; i < n_elems; i++) { + if (!EQUAL_FP(v1->f64[i], v2->f64[i])) { + printf("ERROR(%d): %s failed at %d'th element: %7g != %7g \n", line, + banner ? banner : "", i, v1->f64[i], v2->f64[i]); + display_pdf(got, "got:", n_elems); + display_pdf(expected, "exp:", n_elems); + n_errs++; + fail = 1; + break; + } + } + return fail; +} + +#endif /* M512_TEST_UTIL_H_INCLUDED */ Index: SingleSource/UnitTests/Vector/AVX512/maskz.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/maskz.c +++ SingleSource/UnitTests/Vector/AVX512/maskz.c @@ -0,0 +1,99 @@ + +#include "m512_test_util.h" +#include +#include +/* + * Here we check for _mm512_maskz_[add|sub]_[round]_ps intrinsics. + */ +volatile int vol0 = 0; + +V512 i32; +V512 i32_squares; +V512 i64; +V512 i64_squares; +V512 f32; +V512 f32_squares; +V512 f32_halves; +V512 f64; +V512 f64_squares; +V512 f64_halves; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_squares.s32[i] = i * i; + f32.f32[i] = i; + f32_squares.f32[i] = i * i; + f32_halves.f32[i] = i + 0.5f; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_squares.s64[i] = i * i; + f64.f64[i] = i; + f64_squares.f64[i] = i * i; + f64_halves.f64[i] = i + 0.5; + } +} + +/* + * Generate function do_"oper"_ps, which tests + * _mm512_maskz_oper_ps(__mmask16, __m512, __m512) and + * _mm512_maskz_oper_round_ps(__mmask16, __m512, __m512, int rounding) + */ + +#define GEN_PS2_OROUND(oper) \ + void NOINLINE do_##oper##_ps() { \ + V512 resm, resz; \ + __mmask16 k; \ + \ + k = 0xbcdf; \ + resm.zmm = _mm512_setzero_ps(); \ + resm.zmm = \ + _mm512_mask_##oper##_ps(resm.zmm, k, f32_halves.zmm, f32_squares.zmm); \ + \ + /* Set resz to all 1's, use vol0 to make it stick. */ \ + resz.zmmi = _mm512_ternarylogic_epi32(i32.zmmi, i32.zmmi, i32.zmmi, 0xff); \ + resz.xmm[vol0] = resz.xmm[vol0]; /* No-op. */ \ + resz.zmm = _mm512_maskz_##oper##_ps(k, f32_halves.zmm, f32_squares.zmm); \ + check_equal_nd(&resz, &resm, 16, "_mm512_maskz_" #oper "_ps", __LINE__); \ + \ + /* Now with a rounding override. */ \ + \ + f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */ \ + resm.zmm = _mm512_setzero_ps(); \ + resm.zmm = _mm512_mask_##oper##_round_ps( \ + resm.zmm, k, f32_halves.zmm, f32_squares.zmm, \ + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); \ + f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */ \ + \ + /* Set resz to all 1's, use vol0 to make it stick. */ \ + resz.zmmi = _mm512_ternarylogic_epi32(i32.zmmi, i32.zmmi, i32.zmmi, 0xff); \ + resz.xmm[vol0] = resz.xmm[vol0]; /* No-op. 
*/ \ + \ + resz.zmm = _mm512_maskz_##oper##_round_ps( \ + k, f32_halves.zmm, f32_squares.zmm, \ + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); \ + check_equal_nd(&resz, &resm, 16, "_mm512_maskz_" #oper "_round_ps", \ + __LINE__); \ + } + +GEN_PS2_OROUND(sub) +GEN_PS2_OROUND(add) + +int main(int argc, char *argv[]) { + init(); + + do_add_ps(); + do_sub_ps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/maskz.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/maskz.reference_output +++ SingleSource/UnitTests/Vector/AVX512/maskz.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/math.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/math.c +++ SingleSource/UnitTests/Vector/AVX512/math.c @@ -0,0 +1,696 @@ +/* + * Test math instructions: sqrt, reciprocal, floor, ceil, exponent, + * scale, fixup ,roundscale and ternary logic. + * Here we check for _mm512_[mask|maskz]_[ceil|floor|scalef|sqrt|ternarylogic] + * intrinsics. + */ +#include "m512_test_util.h" +#include +#include +#include + +volatile int vol0 = 0; + +V512 i32; +V512 i32_squares; +V512 i32_neg; +V512 i64; +V512 i64_squares; +V512 i64_neg; +V512 f32; +V512 f32_squares; +V512 f32_halves; +V512 f64; +V512 f64_squares; +V512 f64_halves; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_squares.s32[i] = i * i; + i32_neg.s32[i] = -i; + f32.f32[i] = i; + f32_squares.f32[i] = i * i; + f32_halves.f32[i] = i + 0.5f; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_squares.s64[i] = i * i; + i64_neg.s64[i] = -i; + f64.f64[i] = i; + f64_squares.f64[i] = i * i; + f64_halves.f64[i] = i + 0.5; + } +} + +void NOINLINE do_rcp14pd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0xc3; + + res.zmmd = _mm512_rcp14_pd(f64.zmmd); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_rcp14_pd(res.zmmd, k, f64.zmmd); +} + +void NOINLINE do_rcp14ps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x7e95; + + res.zmm = _mm512_rcp14_ps(f32.zmm); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_rcp14_ps(res.zmm, k, f32.zmm); +} + +void NOINLINE do_sqrtps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_sqrt_ps(f32_squares.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_sqrt_ps", __LINE__); + + f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_sqrt_ps(res.zmm, k, f32_squares.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_ps", __LINE__); +} + +void NOINLINE do_sqrtpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_sqrt_pd(f64_squares.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_sqrt_pd", __LINE__); + + f64_squares.xmmd[vol0] = f64_squares.xmmd[vol0]; /* No-op. 
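     The self-assignment through the volatile index forces the compiler to
     treat f64_squares as modified, so the masked test below reloads it
     rather than reusing a cached register copy.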
*/ + + k = 0xe9; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_sqrt_pd(res.zmmd, k, f64_squares.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_pd", __LINE__); +} + +void NOINLINE do_floorps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_floor_ps(f32_halves.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_floor_ps", __LINE__); + + f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_floor_ps(res.zmm, k, f32_halves.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_ps", __LINE__); +} + +void NOINLINE do_floorpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_floor_pd(f64_halves.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_floor_pd", __LINE__); + + f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. */ + + k = 0x7b; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_floor_pd(res.zmmd, k, f64_halves.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_pd", __LINE__); +} + +void NOINLINE do_ceilps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_ceil_ps(f32_halves.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i + 1; + } + check_equal_nd(&res, &expected, 16, "_mm512_ceil_ps", __LINE__); + + f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_ceil_ps(res.zmm, k, f32_halves.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i + 1; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_ps", __LINE__); +} + +void NOINLINE do_ceilpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_ceil_pd(f64_halves.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i + 1; + } + check_equal_nd(&res, &expected, 16, "_mm512_ceil_pd", __LINE__); + + f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. 
*/ + + k = 0x7b; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_ceil_pd(res.zmmd, k, f64_halves.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i + 1; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_pd", __LINE__); +} + +void NOINLINE do_getexpsd() { + __mmask8 k8 = 0x2; + volatile __m128d res; + volatile __m128d v1 = _mm_set_pd(8.0, 32.0); + volatile __m128d v2 = _mm_set_pd(16.0, 64.0); + + __m128d res_exp_nomask = _mm_set_pd(8.0, 6.0); + __m128d res_exp_mask = _mm_set_pd(8.0, 32.0); + __m128d res_exp_maskz = _mm_set_pd(8.0, 0.0); + + res = _mm_setzero_pd(); + res = _mm_getexp_sd(v1, v2); + + check_equal_ndf((void *)&res, (void *)&res_exp_nomask, 2, "_mm_getexp_sd", + __LINE__); + + res = _mm_setzero_pd(); + res = _mm_mask_getexp_sd(v1, k8, v1, v2); + check_equal_ndf((void *)&res, (void *)&res_exp_mask, 2, "_mm_mask_getexp_sd", + __LINE__); + + res = _mm_setzero_pd(); + res = _mm_maskz_getexp_sd(k8, v1, v2); + check_equal_ndf((void *)&res, (void *)&res_exp_maskz, 2, + "_mm_maskz_getexp_sd", __LINE__); +} + +void NOINLINE do_getexpss() { + __mmask8 k8 = 0xe; + volatile __m128 res; + volatile __m128 v1 = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f); + volatile __m128 v2 = _mm_set_ps(128.0f, 256.0f, 512.0f, 1024.0f); + + volatile __m128 res_exp_nomask = _mm_set_ps(16.0f, 32.0f, 64.0f, 10.0f); + volatile __m128 res_exp_mask = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f); + volatile __m128 res_exp_maskz = _mm_set_ps(16.0f, 32.0f, 64.0f, 0.0f); + + res = _mm_setzero_ps(); + res = _mm_getexp_ss(v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_nomask, 4, "_mm_getexp_ss", + __LINE__); + + res = _mm_setzero_ps(); + res = _mm_mask_getexp_ss(v1, k8, v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_mask, 2, "_mm_mask_getexp_ss", + __LINE__); + + res = _mm_setzero_ps(); + res = _mm_maskz_getexp_ss(k8, v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_maskz, 4, + "_mm_maskz_getexp_ss", __LINE__); +} + +void NOINLINE do_getmantpd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.zmmd = + _mm512_getmant_pd(f64.zmmd, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_getmant_pd(res.zmmd, k, f64.zmmd, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); +} + +void NOINLINE do_getmantps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x7e95; + + res.zmm = _mm512_getmant_ps(f32.zmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_getmant_ps(res.zmm, k, f32.zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); +} + +#define CHECK_SCALEFPD(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.f64[i] = f64.f64[i] * (pow(2.0, floor(f64_squares.f64[i]))); \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0.0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + f64.xmmd[vol0] = f64.xmmd[vol0]; \ + } + +void NOINLINE do_scalefpd() { + V512 res; + V512 expected; + __mmask8 k = 0xFF; + + res.zmmd = _mm512_scalef_round_pd(f64.zmmd, f64_squares.zmmd, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_round_pd"); + + res.zmmd = _mm512_scalef_pd(f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_pd"); + + k = 0x75; + + res.zmmd = 
_mm512_mask_scalef_round_pd( + f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_round_pd"); + + res.zmmd = + _mm512_mask_scalef_pd(f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_pd"); + + k = 0x57; + + res.zmmd = _mm512_maskz_scalef_round_pd( + k, f64.zmmd, f64_squares.zmmd, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_round_pd"); + + res.zmmd = _mm512_maskz_scalef_pd(k, f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_pd"); +} + +#define CHECK_SCALEFPS(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.f32[i] = f32.f32[i] * (powf(2.0F, floorf(f32_squares.f32[i]))); \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.f32[i] = 0.0F; \ + } else { \ + expected.f32[i] = dest.f32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ + f32.xmm[vol0] = f32.xmm[vol0]; \ + } + +void NOINLINE do_scalefps() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.zmm = _mm512_scalef_round_ps(f32.zmm, f32_squares.zmm, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_round_ps"); + + res.zmm = _mm512_scalef_ps(f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_ps"); + + k = 0x0bcd; + + res.zmm = + _mm512_mask_scalef_round_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_round_ps"); + + res.zmm = _mm512_mask_scalef_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_ps"); + + k = 0x0dcb; + + res.zmm = _mm512_maskz_scalef_round_ps( + k, f32.zmm, f32_squares.zmm, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_round_ps"); + + res.zmm = _mm512_maskz_scalef_ps(k, f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_ps"); +} + +#define SOME_ROUND (_MM_FROUND_CUR_DIRECTION) + +void NOINLINE do_fixupimmpd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.zmmd = _mm512_fixupimm_pd(f64.zmmd, f64_squares.zmmd, i32.zmmi, 0x97); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_fixupimm_pd(res.zmmd, k, f64.zmmd, i32.zmmi, 0xfe); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_maskz_fixupimm_pd(k, res.zmmd, f64.zmmd, i32.zmmi, 0xfe); + + res.zmmd = _mm512_fixupimm_round_pd(f64.zmmd, f64_squares.zmmd, i32.zmmi, + 0x97, SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_fixupimm_round_pd(res.zmmd, k, f64.zmmd, i32.zmmi, + 0xfe, SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_maskz_fixupimm_round_pd(k, res.zmmd, f64.zmmd, i32.zmmi, + 0xfe, SOME_ROUND); +} + +void NOINLINE do_fixupimmps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x75; + + res.zmm = _mm512_fixupimm_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_fixupimm_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_maskz_fixupimm_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe); + + res.zmm = _mm512_fixupimm_round_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97, + 
SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_fixupimm_round_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe, + SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_maskz_fixupimm_round_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe, + SOME_ROUND); +} + +void NOINLINE do_fixupimmsd() { + volatile V512 res; + V512 expected; + + __mmask8 k = 0x75; + + res.xmmd[0] = + _mm_fixupimm_sd(f64.xmmd[0], f64_squares.xmmd[0], i32.xmmi[0], 0x97); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = + _mm_mask_fixupimm_sd(res.xmmd[0], k, f64.xmmd[0], i32.xmmi[0], 0xfe); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = + _mm_maskz_fixupimm_sd(k, res.xmmd[0], f64.xmmd[0], i32.xmmi[0], 0xfe); + + res.xmmd[0] = _mm_fixupimm_round_sd(f64.xmmd[0], f64_squares.xmmd[0], + i32.xmmi[0], 0x97, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = _mm_mask_fixupimm_round_sd(res.xmmd[0], k, f64.xmmd[0], + i32.xmmi[0], 0xfe, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = _mm_maskz_fixupimm_round_sd(k, res.xmmd[0], f64.xmmd[0], + i32.xmmi[0], 0xfe, SOME_ROUND); +} + +void NOINLINE do_fixupimmss() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.xmm[0] = + _mm_fixupimm_ss(f32.xmm[0], f32_squares.xmm[0], i32.xmmi[0], 0x97); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = + _mm_mask_fixupimm_ss(res.xmm[0], k, f32.xmm[0], i32.xmmi[0], 0xfe); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = + _mm_maskz_fixupimm_ss(k, res.xmm[0], f32.xmm[0], i32.xmmi[0], 0xfe); + + res.xmm[0] = _mm_fixupimm_round_ss(f64.xmm[0], f64_squares.xmm[0], + i32.xmmi[0], 0x97, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = _mm_mask_fixupimm_round_ss(res.xmm[0], k, f64.xmm[0], + i32.xmmi[0], 0xfe, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = _mm_maskz_fixupimm_round_ss(k, res.xmm[0], f64.xmm[0], + i32.xmmi[0], 0xfe, SOME_ROUND); +} + +void NOINLINE do_roundscalepd() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x3d; + + res.zmmd = _mm512_roundscale_pd(f64.zmmd, 0xff); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_roundscale_pd(res.zmmd, k, f64.zmmd, 0x36); +} + +void NOINLINE do_roundscaleps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x74cb; + + res.zmm = _mm512_roundscale_ps(f32.zmm, 0xf7); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_roundscale_ps(res.zmm, k, f32.zmm, 0x36); +} + +static int NOINLINE emulate_ternarylogicd(int a, int b, int c, int imm) { + int i, index, res = 0; + + for (i = 0; i < 32; i++) { + index = ((a & 1) << 2) | ((b & 1) << 1) | (c & 1); + res |= ((imm & (1 << index)) ? 1 : 0) << i; + a >>= 1; + b >>= 1; + c >>= 1; + } + + return res; +} + +void NOINLINE do_pternlogq() { + volatile int i; + V512 res, resx, resy; + V512 expected; + __mmask8 k8 = 0x75; + + res.zmmi = + _mm512_ternarylogic_epi64(i64.zmmi, i64_squares.zmmi, i64.zmmi, 0x79); + for (i = 0; i < 16; i++) { + expected.s32[i] = + emulate_ternarylogicd(i64.s32[i], i64_squares.s32[i], i64.s32[i], 0x79); + } + check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi64", __LINE__); + + i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. 
*/ + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_ternarylogic_epi64(res.zmmi, k8, i64_neg.zmmi, + i64.zmmi, 0xca); + for (i = 0; i < 16; i += 2) { + if (k8 & (1 << (i / 2))) { + expected.s32[i] = + emulate_ternarylogicd(0, i64_neg.s32[i], i64.s32[i], 0xca); + expected.s32[i + 1] = + emulate_ternarylogicd(0, i64_neg.s32[i + 1], i64.s32[i + 1], 0xca); + } else { + expected.s32[i] = 0; + expected.s32[i + 1] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi64", + __LINE__); + + i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. */ + + res.zmmi = _mm512_maskz_ternarylogic_epi64(k8, i64_squares.zmmi, + i64_squares.zmmi, i64.zmmi, 0x3b); + for (i = 0; i < 16; i += 2) { + if (k8 & (1 << (i / 2))) { + expected.s32[i] = emulate_ternarylogicd( + i64_squares.s32[i], i64_squares.s32[i], i64.s32[i], 0x3b); + expected.s32[i + 1] = emulate_ternarylogicd( + i64_squares.s32[i + 1], i64_squares.s32[i + 1], i64.s32[i], 0x3b); + } else { + expected.s32[i] = 0; + expected.s32[i + 1] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi64", + __LINE__); +} + +void NOINLINE do_pternlogd() { + volatile int i; + V512 res, resx, resy; + V512 expected; + __mmask16 k = 0x23bc; + __mmask8 k8 = (__mmask8)k; + + res.zmmi = + _mm512_ternarylogic_epi32(i32.zmmi, i32_squares.zmmi, i32.zmmi, 0x97); + for (i = 0; i < 16; i++) { + expected.s32[i] = + emulate_ternarylogicd(i32.s32[i], i32_squares.s32[i], i32.s32[i], 0x97); + } + check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi32", __LINE__); + + i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. */ + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_ternarylogic_epi32(res.zmmi, k, i32_squares.zmmi, + i32.zmmi, 0xfe); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[i] = + emulate_ternarylogicd(0, i32_squares.s32[i], i32.s32[i], 0xfe); + } else { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi32", + __LINE__); + + i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. 
*/ + + k = 0xabcd; + k8 = (__mmask8)k; + res.zmmi = _mm512_maskz_ternarylogic_epi32(k, i32_squares.zmmi, i32_neg.zmmi, + i32.zmmi, 0x3b); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[i] = emulate_ternarylogicd(i32_squares.s32[i], + i32_neg.s32[i], i32.s32[i], 0x3b); + } else { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi32", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_rcp14pd(); + do_rcp14ps(); + + do_sqrtps(); + do_sqrtpd(); + + do_floorps(); + do_floorpd(); + + do_ceilps(); + do_ceilpd(); + + do_getexpsd(); + do_getexpss(); + + do_getmantpd(); + do_getmantps(); + + do_scalefpd(); + do_scalefps(); + + do_fixupimmpd(); + do_fixupimmps(); + + do_fixupimmsd(); + do_fixupimmss(); + + do_roundscalepd(); + do_roundscaleps(); + + do_pternlogq(); + do_pternlogd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/math.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/math.reference_output +++ SingleSource/UnitTests/Vector/AVX512/math.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/minmax_int64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/minmax_int64.c +++ SingleSource/UnitTests/Vector/AVX512/minmax_int64.c @@ -0,0 +1,135 @@ +/* + * Test the min/max int64 family of intrinsics. + * Here we check for _mm512_[mask|maskz]_[min|max]_ep[i|u]64 intrinsics. + */ + +#include "m512_test_util.h" +#include + +V512 i64, i64_2; + +void NOINLINE init() { + volatile int i; + for (i = 0; i < 8; i++) { + if (i % 2) { + i64.s64[i] = (i + 1) * 10000; + i64_2.s64[i] = -(i + 1) * 1000; + } else { + i64.s64[i] = -(i + 1) * 1000; + i64_2.s64[i] = (i + 1) * 10000; + } + } +} + +void NOINLINE do_512_max_epi64() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; /* 11011011 */ + + /* scalar calculation */ + for (i = 0; i < 8; i++) { + expected.s64[i] = (i64.s64[i] > i64_2.s64[i]) ? i64.s64[i] : i64_2.s64[i]; + } + /* intrinsic calculation */ + res.zmmi = _mm512_max_epi64(i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_max_epi64", __LINE__); + + /* scalar mask */ + expected.s64[2] = 0; + expected.s64[5] = 0; + + /* masked intrinsic calculation */ + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_max_epi64(res.zmmi, k, i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_mask_max_epi64", __LINE__); +} + +void NOINLINE do_512_max_epu64() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; /* 11011011 */ + + /* scalar calculation */ + for (i = 0; i < 8; i++) { + expected.u64[i] = (i64.u64[i] > i64_2.u64[i]) ? 
i64.u64[i] : i64_2.u64[i]; + } + /* intrinsic calculation */ + res.zmmi = _mm512_max_epu64(i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_max_epu64", __LINE__); + + /* scalar mask */ + expected.u64[2] = 0; + expected.u64[5] = 0; + + /* masked intrinsic calculation */ + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_max_epu64(res.zmmi, k, i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_mask_max_epu64", __LINE__); +} + +void NOINLINE do_512_min_epi64() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; /* 11011011 */ + + /* scalar calculation */ + for (i = 0; i < 8; i++) { + expected.s64[i] = (i64.s64[i] < i64_2.s64[i]) ? i64.s64[i] : i64_2.s64[i]; + } + /* intrinsic calculation */ + res.zmmi = _mm512_min_epi64(i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_min_epi64", __LINE__); + + /* scalar mask */ + expected.s64[2] = 0; + expected.s64[5] = 0; + + /* masked intrinsic calculation */ + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_min_epi64(res.zmmi, k, i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_mask_min_epi64", __LINE__); +} + +void NOINLINE do_512_min_epu64() { + V512 res; + V512 expected; + volatile int i; + __mmask8 k = 0xdb; /* 11011011 */ + + /* scalar calculation */ + for (i = 0; i < 8; i++) { + expected.u64[i] = (i64.u64[i] < i64_2.u64[i]) ? i64.u64[i] : i64_2.u64[i]; + } + /* intrinsic calculation */ + res.zmmi = _mm512_min_epu64(i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_min_epu64", __LINE__); + + /* scalar mask */ + expected.u64[2] = 0; + expected.u64[5] = 0; + + /* masked intrinsic calculation */ + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_min_epu64(res.zmmi, k, i64.zmmi, i64_2.zmmi); + check_equal_nq(&res, &expected, 8, "_mm512_mask_min_epu64", __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_512_max_epi64(); + do_512_max_epu64(); + do_512_min_epi64(); + do_512_min_epu64(); + + if (n_errs) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output +++ SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/minmax_shift.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/minmax_shift.c +++ SingleSource/UnitTests/Vector/AVX512/minmax_shift.c @@ -0,0 +1,1157 @@ +/* + * Test min, max and shift instructions + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_[mask|maskz]_[abs|min|max] + * _mm512_[mask|maskz]_[sll|slli|srai|srli|sra|srl]_* + * _mm512_[mask|maskz]_cvtep*_ep* + * _mm256_cvtepi16_epi64 + * _mm256_sll + * _mm_cvtepi16_epi64 + * _mm_sll + */ +#include "m512_test_util.h" +#include +#include + +V512 i8; +V512 i8_mix; +V512 i8_big; +V512 i16; +V512 i16_mix; +V512 i16_big; +V512 i32; +V512 i32_mix; +V512 i32_big; +V512 i64; +V512 i64_mix; +V512 i64_big; +volatile int vol0 = 0; +/* + * Use this between tests to make compiler think src was updated. + * Prevents PRE'ing of a load of src, thus allowing ciscization. 
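+ * ("PRE" = the compiler's partial redundancy elimination; "ciscization"
+ * means folding the reload into the memory operand of the instruction.)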
+ */ +#define soft_update(src) (src).xmmi[vol0] = (src).xmmi[vol0] + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8.s8[i] = i; + i8_mix.s8[i] = (i & 1) ? i : -i; + i8_big.s8[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i8_big.s8[i] = -i8_big.s8[i]; + } + } + + for (i = 0; i < 32; i++) { + i16.s16[i] = i; + i16_mix.s16[i] = (i & 1) ? i : -i; + i16_big.s16[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i16_big.s16[i] = -i16_big.s16[i]; + } + } + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_mix.s32[i] = (i & 1) ? i : -i; + i32_big.s32[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i32_big.s32[i] = -i32_big.s32[i]; + } + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_mix.s64[i] = (i & 1) ? i : -i; + i64_big.s64[i] = 1000 * (i + 1); + if ((i & 1) != 0) { + i64_big.s64[i] = -i64_big.s64[i]; + } + } +} + +void NOINLINE do_absd() { + V512 res; + V512 expected; + __mmask16 k; + + res.zmmi = _mm512_abs_epi32(i32_mix.zmmi); + check_equal_nd(&res, &i32, 16, "_mm512_abs_epi32", __LINE__); + + k = 0x1234; + res.zmmi = _mm512_mask_abs_epi32(_mm512_setzero_epi32(), k, i32_mix.zmmi); + expected.zmmi = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), k, i32.zmmi); + check_equal_nd(&res, &expected, 16, "_mm512_mask_abs_epi32", __LINE__); +} + +void NOINLINE do_absq() { + V512 res; + V512 expected; + __mmask8 k; + + res.zmmi = _mm512_abs_epi64(i64_mix.zmmi); + check_equal_nd(&res, &i64, 16, "_mm512_abs_epi64", __LINE__); + + k = 0x73; + res.zmmi = _mm512_mask_abs_epi64(_mm512_setzero_epi32(), k, i64_mix.zmmi); + expected.zmmi = _mm512_mask_mov_epi64(_mm512_setzero_epi32(), k, i64.zmmi); + check_equal_nd(&res, &expected, 16, "_mm512_mask_abs_epi64", __LINE__); +} + +void NOINLINE do_movsxwq() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask8 k8 = 0xe7; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepi16_epi64(i16_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.s64[i] = i16_mix.s16[i]; + } + expected_save = expected; + soft_update(i16_mix); + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi16_epi64", __LINE__); + check_equal_nd(&yres, &expected, 8, "_mm256_cvtepi16_epi64", __LINE__); + check_equal_nd(&xres, &expected, 4, "_mm_cvtepi16_epi64", __LINE__); + + /* Masked. */ + + soft_update(i16_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepi16_epi64(zres.zmmi, k8, i16_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + soft_update(i16_mix); + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi16_epi64", __LINE__); + + /* Zero-masked. */ + + zres = i8_mix; + yres = zres; + xres = zres; + soft_update(i16_mix); + zres.zmmi = _mm512_maskz_cvtepi16_epi64(k8, i16_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + soft_update(i16_mix); + soft_update(i16_mix); + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi16_epi64", __LINE__); +} + +void NOINLINE do_movsxdq() { + V512 xres, yres, zres; + V512 expected; + __mmask8 k8 = 0x5d; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepi32_epi64(i32_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi64(7, -6, 5, -4, 3, -2, 1, 0); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi32_epi64", __LINE__); + + /* Masked. 
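     Merge-masking: lanes whose mask bit is clear keep whatever was already
     in the destination (zeros here).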
*/ + + soft_update(i32_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepi32_epi64(zres.zmmi, k8, i32_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi64(0, -6, 0, -4, 3, -2, 0, 0); + soft_update(i32_mix); + soft_update(i32_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi32_epi64", __LINE__); + + /* Zero-masked. */ + + k8 = 0x79; + soft_update(i32_mix); + zres = i8_mix; + yres = zres; + xres = zres; + zres.zmmi = _mm512_maskz_cvtepi32_epi64(k8, i32_mix.ymmi[0]); + soft_update(i32_mix); + soft_update(i32_mix); + expected.zmmi = _mm512_set_epi64(7, -6, 5, -4, 3, -2, 1, 0); + expected.zmmi = _mm512_maskz_mov_epi64(k8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi32_epi64", __LINE__); +} + +void NOINLINE do_movsxbd() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask16 k16 = 0xfefe; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepi8_epi32(i8_mix.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.s32[i] = i8_mix.s8[i]; + } + expected_save = expected; + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi8_epi32", __LINE__); + soft_update(i8_mix); + + /* Masked. */ + + soft_update(i8_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepi8_epi32(zres.zmmi, k16, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 16; i++) { + if ((k16 & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi8_epi32", __LINE__); + soft_update(i8_mix); + + /* Zero-masked. */ + + soft_update(i8_mix); + k16 <<= 1; + k8 = (__mmask8)k16; + zres.zmmi = _mm512_maskz_cvtepi8_epi32(k16, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 16; i++) { + if ((k16 & (1 << i)) == 0) { + expected.s32[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi8_epi32", __LINE__); + soft_update(i8_mix); +} + +void NOINLINE do_movsxbq() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask8 k8 = 0xfe; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepi8_epi64(i8_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.s64[i] = i8_mix.s8[i]; + } + expected_save = expected; + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi8_epi64", __LINE__); + soft_update(i8_mix); + + /* Masked. */ + + soft_update(i8_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepi8_epi64(zres.zmmi, k8, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi8_epi64", __LINE__); + soft_update(i8_mix); + + /* Zero-masked. */ + + soft_update(i8_mix); + k8 <<= 1; + zres.zmmi = _mm512_maskz_cvtepi8_epi64(k8, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.s64[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi8_epi64", __LINE__); + soft_update(i8_mix); +} + +void NOINLINE do_movzxwd() { + V512 xres, yres, zres; + V512 expected; + __mmask16 k16 = 0xc936; + __mmask8 k8 = (__mmask8)k16; + ; + + /* Non-masked. 
*/ + + zres.zmmi = _mm512_cvtepu16_epi32(i16_mix.ymmi[0]); + expected.zmmi = + _mm512_set_epi32(15, 0xfff2, 13, 0xfff4, 11, 0xfff6, 9, 0xfff8, 7, 0xfffa, + 5, 0xfffc, 3, 0xfffe, 1, 0); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu16_epi32", __LINE__); + + /* Masked. */ + + soft_update(i16_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepu16_epi32(zres.zmmi, k16, i16_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi32(15, 0xfff2, 0, 0, 11, 0, 0, 0xfff8, 0, 0, 5, + 0xfffc, 0, 0xfffe, 1, 0); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu16_epi32", __LINE__); + + /* Zero-masked. */ + + zres = i8_mix; + yres = zres; + xres = zres; + soft_update(i16_mix); + zres.zmmi = _mm512_maskz_cvtepu16_epi32(k16, i16_mix.ymmi[0]); + expected.zmmi = + _mm512_set_epi32(15, 0xfff2, 13, 0xfff4, 11, 0xfff6, 9, 0xfff8, 7, 0xfffa, + 5, 0xfffc, 3, 0xfffe, 1, 0); + expected.zmmi = _mm512_maskz_mov_epi32(k16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu16_epi32", __LINE__); +} + +void NOINLINE do_movzxwq() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask8 k8 = 0xe7; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepu16_epi64(i16_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = i16_mix.u16[i]; + } + expected_save = expected; + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu16_epi64", __LINE__); + + /* Masked. */ + + soft_update(i16_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepu16_epi64(zres.zmmi, k8, i16_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.u64[i] = 0; + } + } + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu16_epi64", __LINE__); + + /* Zero-masked. */ + + zres = i8_mix; + yres = zres; + xres = zres; + soft_update(i16_mix); + zres.zmmi = _mm512_maskz_cvtepu16_epi64(k8, i16_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.u64[i] = 0; + } + } + soft_update(i16_mix); + soft_update(i16_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu16_epi64", __LINE__); +} + +void NOINLINE do_movzxdq() { + V512 xres, yres, zres; + V512 expected; + __mmask8 k8 = 0xeb; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepu32_epi64(i32_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, (unsigned int)-4, 3, + (unsigned int)-2, 1, 0); + soft_update(i32_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu32_epi64", __LINE__); + + /* Masked. */ + + soft_update(i32_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepu32_epi64(zres.zmmi, k8, i32_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi64(0, -6, 0, -4, 3, -2, 0, 0); + expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, 0, 3, 0, 1, 0); + soft_update(i32_mix); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu32_epi64", __LINE__); + + /* Zero-masked. 
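     Zero-masking: lanes whose mask bit is clear are forced to zero
     regardless of the destination's previous contents (i8_mix here).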
*/ + + k8 = 0xe7; + soft_update(i32_mix); + zres = i8_mix; + yres = zres; + xres = zres; + zres.zmmi = _mm512_maskz_cvtepu32_epi64(k8, i32_mix.ymmi[0]); + soft_update(i32_mix); + expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, (unsigned int)-4, 3, + (unsigned int)-2, 1, 0); + expected.zmmi = _mm512_maskz_mov_epi64(k8, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu32_epi64", __LINE__); +} + +void NOINLINE do_movzxbd() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask16 k16 = 0xfefe; + __mmask8 k8 = (__mmask8)k16; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepu8_epi32(i8_mix.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = i8_mix.u8[i]; + } + expected_save = expected; + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu8_epi32", __LINE__); + soft_update(i8_mix); + + /* Masked. */ + + soft_update(i8_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepu8_epi32(zres.zmmi, k16, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 16; i++) { + if ((k16 & (1 << i)) == 0) { + expected.u32[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu8_epi32", __LINE__); + soft_update(i8_mix); + + /* Zero-masked. */ + + soft_update(i8_mix); + k16 <<= 1; + k8 = (__mmask8)k16; + zres.zmmi = _mm512_maskz_cvtepu8_epi32(k16, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 16; i++) { + if ((k16 & (1 << i)) == 0) { + expected.u32[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu8_epi32", __LINE__); + soft_update(i8_mix); +} + +void NOINLINE do_movzxbq() { + V512 xres, yres, zres; + V512 expected, expected_save; + volatile int i; + __mmask8 k8 = 0xfe; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepu8_epi64(i8_mix.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = i8_mix.u8[i]; + } + expected_save = expected; + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu8_epi64", __LINE__); + soft_update(i8_mix); + + /* Masked. */ + + soft_update(i8_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepu8_epi64(zres.zmmi, k8, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.u64[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu8_epi64", __LINE__); + soft_update(i8_mix); + + /* Zero-masked. 
*/ + + soft_update(i8_mix); + k8 <<= 1; + zres.zmmi = _mm512_maskz_cvtepu8_epi64(k8, i8_mix.xmmi[0]); + expected = expected_save; + for (i = 0; i < 8; i++) { + if ((k8 & (1 << i)) == 0) { + expected.u64[i] = 0; + } + } + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu8_epi64", __LINE__); + soft_update(i8_mix); +} + +void NOINLINE do_maxsd() { + V512 res; + V512 expected; + __mmask8 k = 0x5d; + + res.zmmi = _mm512_max_epi32(i32.zmmi, i32_mix.zmmi); + check_equal_nd(&res, &i32, 16, "_mm512_max_epi32", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_max_epi32(res.zmmi, k, i32.zmmi, i32_mix.zmmi); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32.zmm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_max_epi32", __LINE__); +} + +void NOINLINE do_maxud() { + V512 res; + V512 expected; + __mmask16 k = 0x5d; + + res.zmmi = _mm512_max_epu32(i32.zmmi, i32_mix.zmmi); + check_equal_nd(&res, &i32_mix, 16, "_mm512_max_epu32", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_max_epu32(res.zmmi, k, i32.zmmi, i32_mix.zmmi); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32_mix.zmm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_max_epu32", __LINE__); +} + +void NOINLINE do_minsd() { + V512 res; + V512 expected; + __mmask8 k = 0x5d; + + res.zmmi = _mm512_min_epi32(i32.zmmi, i32_mix.zmmi); + check_equal_nd(&res, &i32_mix, 16, "_mm512_min_epi32", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_min_epi32(res.zmmi, k, i32.zmmi, i32_mix.zmmi); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32_mix.zmm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_min_epi32", __LINE__); +} + +void NOINLINE do_minud() { + V512 res; + V512 expected; + __mmask16 k = 0x5d; + + res.zmmi = _mm512_min_epu32(i32.zmmi, i32_mix.zmmi); + check_equal_nd(&res, &i32, 16, "_mm512_min_epu32", __LINE__); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_min_epu32(res.zmmi, k, i32.zmmi, i32_mix.zmmi); + expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32.zmm); + check_equal_nd(&res, &expected, 16, "_mm512_mask_min_epu32", __LINE__); +} + +void NOINLINE do_pslld() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask16 k; + __mmask8 k8; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_sll_epi32(i32_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = (count > 31) ? 0 : (i32_big.u32[i] << count); + } + check_equal_nd(&res, &expected, 16, "_mm512_sll_epi32", __LINE__); + + soft_update(vcount); + res.ymmi[0] = _mm256_sll_epi32(i32_big.ymmi[0], vcount.xmmi[0]); + check_equal_nd(&res, &expected, 8, "_mm256_sll_epi32", __LINE__); + + soft_update(vcount); + res.xmmi[0] = _mm_sll_epi32(i32_big.xmmi[0], vcount.xmmi[0]); + check_equal_nd(&res, &expected, 4, "_mm_sll_epi32", __LINE__); + + k = 0x7fdb; + k8 = (__mmask8)k; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_sll_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (count > 31) ? 
0 : (i32_mix.u32[i] << count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sll_epi32", __LINE__); + + soft_update(vcount); + res.ymmi[0] = _mm256_setzero_si256(); + + soft_update(vcount); + res.xmmi[0] = _mm_setzero_si128(); + + res = i8_mix; + res.zmmi = _mm512_maskz_sll_epi32(k, i32_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sll_epi32", __LINE__); + + soft_update(vcount); + res = i8_mix; + + soft_update(vcount); + res = i8_mix; +} + +void NOINLINE do_psllq() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask8 k; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_sll_epi64(i64_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = (count > 63) ? 0 : (i64_big.u64[i] << count); + } + check_equal_nd(&res, &expected, 16, "_mm512_sll_epi64", __LINE__); + + soft_update(vcount); + res.ymmi[0] = _mm256_sll_epi64(i64_big.ymmi[0], vcount.xmmi[0]); + check_equal_nd(&res, &expected, 8, "_mm256_sll_epi64", __LINE__); + + soft_update(vcount); + res.xmmi[0] = _mm_sll_epi64(i64_big.xmmi[0], vcount.xmmi[0]); + check_equal_nd(&res, &expected, 4, "_mm_sll_epi64", __LINE__); + + k = 0xc3; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_sll_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (count > 63) ? 0 : (i64_mix.u64[i] << count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sll_epi64", __LINE__); + + soft_update(vcount); + res.ymmi[0] = _mm256_setzero_si256(); + + soft_update(vcount); + res.xmmi[0] = _mm_setzero_si128(); + + res = i8_mix; + res.zmmi = _mm512_maskz_sll_epi64(k, i64_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sll_epi64", __LINE__); + + soft_update(vcount); + res = i8_mix; + + soft_update(vcount); + res = i8_mix; +} + +void NOINLINE do_pslldi(int cnt) { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmmi = _mm512_slli_epi32(i32_big.zmmi, 3); + for (i = 0; i < 16; i++) { + expected.u32[i] = i32_big.u32[i] << 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_slli_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_slli_epi32(res.zmmi, k, i32_mix.zmmi, 6); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (i32_mix.u32[i] << 6); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi32", __LINE__); + + res.zmmi = _mm512_slli_epi32(i32_big.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.u32[i] = i32_big.u32[i] << cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_slli_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_slli_epi32(res.zmmi, k, i32_mix.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (i32_mix.u32[i] << cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi32", __LINE__); +} + +void NOINLINE do_psllqi(int cnt) { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmi = _mm512_slli_epi64(i64_big.zmmi, 3); + for (i = 0; i < 8; i++) { + expected.u64[i] = i64_big.u64[i] << 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_slli_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_slli_epi64(res.zmmi, k, i64_mix.zmmi, 6); + for (i = 0; i < 8; i++) { 
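+    /* Unselected lanes stay zero; selected lanes hold the value shifted by 6. */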
+ expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (i64_mix.u64[i] << 6); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi64", __LINE__); + + res.zmmi = _mm512_slli_epi64(i64_big.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.u64[i] = i64_big.u64[i] << cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_slli_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_slli_epi64(res.zmmi, k, i64_mix.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (i64_mix.u64[i] << cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi64", __LINE__); +} + +void NOINLINE do_psradi(int cnt) { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmmi = _mm512_srai_epi32(i32_big.zmmi, 3); + for (i = 0; i < 16; i++) { + expected.s32[i] = i32_big.s32[i] >> 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_srai_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srai_epi32(res.zmmi, k, i32_mix.zmmi, 6); + for (i = 0; i < 16; i++) { + expected.s32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s32[i] = (i32_mix.s32[i] >> 6); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi32", __LINE__); + + res.zmmi = _mm512_srai_epi32(i32_big.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.s32[i] = i32_big.s32[i] >> cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_srai_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srai_epi32(res.zmmi, k, i32_mix.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.s32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s32[i] = (i32_mix.s32[i] >> cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi32", __LINE__); +} + +void NOINLINE do_psrldi(int cnt) { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmmi = _mm512_srli_epi32(i32_big.zmmi, 3); + for (i = 0; i < 16; i++) { + expected.u32[i] = i32_big.u32[i] >> 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_srli_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srli_epi32(res.zmmi, k, i32_mix.zmmi, 6); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (i32_mix.u32[i] >> 6); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi32", __LINE__); + + res.zmmi = _mm512_srli_epi32(i32_big.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.u32[i] = i32_big.u32[i] >> cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_srli_epi32", __LINE__); + + k = 0x9786; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srli_epi32(res.zmmi, k, i32_mix.zmmi, cnt); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (i32_mix.u32[i] >> cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi32 #2", __LINE__); +} + +void NOINLINE do_psraqi(int cnt) { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmi = _mm512_srai_epi64(i64_big.zmmi, 3); + for (i = 0; i < 8; i++) { + expected.s64[i] = i64_big.s64[i] >> 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_srai_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srai_epi64(res.zmmi, k, i64_mix.zmmi, 6); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s64[i] = (i64_mix.s64[i] >> 6); 
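+      /* Relies on >> of a signed value being an arithmetic (sign-preserving)
+         shift, matching vpsraq. */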
+ } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi64", __LINE__); + + res.zmmi = _mm512_srai_epi64(i64_big.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.s64[i] = i64_big.s64[i] >> cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_srai_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srai_epi64(res.zmmi, k, i64_mix.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s64[i] = (i64_mix.s64[i] >> cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi64", __LINE__); +} + +void NOINLINE do_psrlqi(int cnt) { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmi = _mm512_srli_epi64(i64_big.zmmi, 3); + for (i = 0; i < 8; i++) { + expected.u64[i] = i64_big.u64[i] >> 3; + } + check_equal_nd(&res, &expected, 16, "_mm512_srli_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srli_epi64(res.zmmi, k, i64_mix.zmmi, 6); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (i64_mix.u64[i] >> 6); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi64", __LINE__); + + res.zmmi = _mm512_srli_epi64(i64_big.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.u64[i] = i64_big.u64[i] >> cnt; + } + check_equal_nd(&res, &expected, 16, "_mm512_srli_epi64", __LINE__); + + k = 0x97; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srli_epi64(res.zmmi, k, i64_mix.zmmi, cnt); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (i64_mix.u64[i] >> cnt); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi64", __LINE__); +} + +void NOINLINE do_psrad() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask16 k; + __mmask8 k8; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_sra_epi32(i32_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.s32[i] = (count > 31) ? 0 : (i32_big.s32[i] >> count); + } + check_equal_nd(&res, &expected, 16, "_mm512_sra_epi32", __LINE__); + + k = 0x7fdb; + k8 = (__mmask8)k; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_sra_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s32[i] = (count > 31) ? 0 : (i32_mix.s32[i] >> count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sra_epi32", __LINE__); + + res = i8_mix; + res.zmmi = _mm512_maskz_sra_epi32(k, i32_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sra_epi32", __LINE__); +} + +void NOINLINE do_psrld() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask16 k; + __mmask8 k8; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_srl_epi32(i32_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = (count > 31) ? 0 : (i32_big.u32[i] >> count); + } + check_equal_nd(&res, &expected, 16, "_mm512_srl_epi32", __LINE__); + + k = 0x7fdb; + k8 = (__mmask8)k; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srl_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 16; i++) { + expected.u32[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u32[i] = (count > 31) ? 
0 : (i32_mix.u32[i] >> count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srl_epi32", __LINE__); + + res = i8_mix; + res.zmmi = _mm512_maskz_srl_epi32(k, i32_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srl_epi32", __LINE__); +} + +void NOINLINE do_psraq() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask8 k; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_sra_epi64(i64_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.s64[i] = (count > 63) ? 0 : (i64_big.s64[i] >> count); + } + check_equal_nd(&res, &expected, 16, "_mm512_sra_epi64", __LINE__); + + k = 0xc3; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_sra_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.s64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.s64[i] = (count > 63) ? 0 : (i64_mix.s64[i] >> count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sra_epi64", __LINE__); + + res = i8_mix; + res.zmmi = _mm512_maskz_sra_epi64(k, i64_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_sra_epi64", __LINE__); +} + +void NOINLINE do_psrlq() { + V512 res; + V512 vcount; + V512 expected; + volatile int i; + int count = 7; + __mmask8 k; + + i = 0; + vcount.zmm = _mm512_setzero_ps(); + vcount.u64[i] = count; + + res.zmmi = _mm512_srl_epi64(i64_big.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = (count > 63) ? 0 : (i64_big.u64[i] >> count); + } + check_equal_nd(&res, &expected, 16, "_mm512_srl_epi64", __LINE__); + + k = 0xc3; + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_srl_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]); + for (i = 0; i < 8; i++) { + expected.u64[i] = 0; + if ((k & (1 << i)) != 0) { + expected.u64[i] = (count > 63) ? 0 : (i64_mix.u64[i] >> count); + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_srl_epi64", __LINE__); + + res.zmmi = _mm512_maskz_srl_epi64(k, i64_mix.zmmi, vcount.xmmi[0]); + check_equal_nd(&res, &expected, 16, "_mm512_maskz_srl_epi64", __LINE__); + + soft_update(vcount); + res = i8_mix; +} + +void NOINLINE do_movsxwd() { + V512 xres, yres, zres; + V512 expected; + __mmask16 k16 = 0x7e5d; + __mmask8 k8 = (__mmask8)k16; + ; + + /* Non-masked. */ + + zres.zmmi = _mm512_cvtepi16_epi32(i16_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi32(15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 5, + -4, 3, -2, 1, 0); + check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi16_epi32", __LINE__); + + /* Masked. */ + + soft_update(i16_mix); + zres.zmmi = _mm512_setzero_epi32(); + yres = zres; + xres = zres; + zres.zmmi = _mm512_mask_cvtepi16_epi32(zres.zmmi, k16, i16_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi32(0, -14, 13, -12, 11, -10, 9, 0, 0, -6, 0, -4, + 3, -2, 0, 0); + check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi16_epi32", __LINE__); + + /* Zero-masked. 
*/ + + zres = i8_mix; + yres = zres; + xres = zres; + soft_update(i16_mix); + zres.zmmi = _mm512_maskz_cvtepi16_epi32(k16, i16_mix.ymmi[0]); + expected.zmmi = _mm512_set_epi32(15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 5, + -4, 3, -2, 1, 0); + expected.zmmi = _mm512_maskz_mov_epi32(k16, expected.zmmi); + check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi16_epi32", __LINE__); +} + +int main(int argc, char *argv[]) { + int cnt; + + init(); + + do_absd(); + do_absq(); + + do_movsxwd(); + do_movsxdq(); + do_movsxbd(); + do_movsxbq(); + do_movzxwd(); + do_movzxwq(); + + do_movzxbd(); + do_movzxbq(); + + do_maxsd(); + do_maxud(); + do_minsd(); + do_minud(); + + do_pslld(); + do_psllq(); + + for (cnt = 0; cnt <= 8; cnt++) { + do_pslldi(cnt); + do_psradi(cnt); + do_psrldi(cnt); + + do_psllqi(cnt); + do_psraqi(cnt); + do_psrlqi(cnt); + } + + do_psrlq(); + do_psraq(); + do_psrld(); + do_psrad(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output +++ SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c +++ SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c @@ -0,0 +1,309 @@ +#include "m512_test_util.h" +#include +#include +#include + +/* + * Here we check for _mm_[mask|maskz]_[add|div|max|min|mul|sub]_[round]_sd + * intrinsics. + */ + +int show_op = +#ifdef SHOW_OP + 1 +#else + 0 +#endif + ; + +typedef enum { ASSIGN, ADD, DIV, MAX, MIN, MUL, SUB } OPER; + +static void NOINLINE intop(OPER op, double ivalout[2], double ivalop1[2], + double ivalop2[2]) { + int i; + int handled = 0; + + memset(ivalout, 0, sizeof(ivalout)); + for (i = 0; i < 2; i += 1) { + switch (op) { + case ASSIGN: + handled = 1; + ivalout[i] = ivalop1[i]; + break; + case ADD: + handled = 1; + ivalout[i] = ivalop1[i] + ivalop2[i]; + break; + case DIV: + handled = 1; + ivalout[i] = ivalop1[i] / ivalop2[i]; + break; + case MAX: + handled = 1; + ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i]; + break; + case MIN: + handled = 1; + ivalout[i] = (ivalop1[i] < ivalop2[i]) ? 
ivalop1[i] : ivalop2[i]; + break; + case MUL: + handled = 1; + ivalout[i] = ivalop2[i] * ivalop1[i]; + break; + case SUB: + handled = 1; + ivalout[i] = ivalop1[i] - ivalop2[i]; + break; + default: + printf("FAIL: bad op\n"); + break; + } + } + if (!handled) { + printf("FAIL: unsupported op\n"); + n_errs++; + } +} + +static int NOINLINE check(double val1[], double good[]) { + int i; + int res = 1; + for (i = 0; i < 2; i += 1) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + return (res); +} + +static int NOINLINE check_mask(double dest[], double val1[], double good[], + int mask, int zeroing) { + int i, j; + int res = 1; + + // elements number to check dest vector + j = 1; + + if (mask == 1) { + if (val1[0] != good[0]) { + res = 0; + printf("FAIL: %f != %f\n", val1[0], dest[0]); + } + } else if (mask == 0) { + if (zeroing == 1) { + if (val1[0] != 0) { + res = 0; + printf("FAIL: %f != %f\n", val1[0], dest[0]); + } + } else { + j = 0; + } + } + + // check other elements of dest vector + for (i = j; i < 2; i += 1) { + if (val1[i] != dest[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], dest[i]); + } + } + return (res); +} + +static void NOINLINE print_vec(char *pfx, double ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%10.4f %10.4f\n", ivec[1], ivec[0]); +} + +#define DOONE(OP, FUNC) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(v1.xmmd[0], v2.xmmd[0]); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, 0x1, 0); \ + passed = check(vvv.f64, good.f64); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_MASK(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(vvv.xmmd[0], MMASK, v1.xmmd[0], v2.xmmd[0]); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 0); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_ZMASK(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(MMASK, v1.xmmd[0], v2.xmmd[0]); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 1); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_ROUND(OP, FUNC, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(v1.xmmd[0], v2.xmmd[0], ROUND); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, 0x1, 0); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(vvv.xmmd[0], MMASK, v1.xmmd[0], v2.xmmd[0], ROUND); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 0); \ + if (!passed) { 
\ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +#define DOONE_WITH_ZMASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f64, v1.f64, v2.f64); \ + vvv.xmmd[0] = FUNC(MMASK, v1.xmmd[0], v2.xmmd[0], ROUND); \ + passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 1); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f64); \ + print_vec("Opand2", v2.f64); \ + print_vec("Scalar", good.f64); \ + print_vec("Vector", vvv.f64); \ + } \ + } + +int main() { + double init1[] = {1, -2, 3, -4, 5, 6, 7, 8, 9, 10, -11, 12, 13, 14, 15, 16}; + double init2[] = {11, 12, 23, -24, 35, 36, 17, 38, + 42, -1, 33, 7, 8, 10, 11, 12}; + + V512 v1; + V512 v2; + V512 good; + V512 vvv; + + intop(ASSIGN, v1.f64, init1, 0); + intop(ASSIGN, v2.f64, init2, 0); + vvv.xmmd[0] = _mm_setzero_pd(); + + // simple mask intrinsics + DOONE_WITH_MASK(ADD, _mm_mask_add_sd, 0x1); + DOONE_WITH_MASK(DIV, _mm_mask_div_sd, 0x1); + DOONE_WITH_MASK(MAX, _mm_mask_max_sd, 0x1); + DOONE_WITH_MASK(MIN, _mm_mask_min_sd, 0x1); + DOONE_WITH_MASK(MUL, _mm_mask_mul_sd, 0x1); + DOONE_WITH_MASK(SUB, _mm_mask_sub_sd, 0x1); + + // intrinsics with rounding mode + DOONE_ROUND(ADD, _mm_add_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(DIV, _mm_div_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MAX, _mm_max_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MIN, _mm_min_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MUL, _mm_mul_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(SUB, _mm_sub_round_sd, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + // intrinsics with mask and rounding mode + DOONE_WITH_MASK_ROUND(ADD, _mm_mask_add_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(DIV, _mm_mask_div_round_sd, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MAX, _mm_mask_max_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MIN, _mm_mask_min_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MUL, _mm_mask_mul_round_sd, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(SUB, _mm_mask_sub_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + // intrinsics with zero masking + DOONE_WITH_ZMASK(ADD, _mm_maskz_add_sd, 0x0); + DOONE_WITH_ZMASK(DIV, _mm_maskz_div_sd, 0x1); + DOONE_WITH_ZMASK(MAX, _mm_maskz_max_sd, 0x1); + DOONE_WITH_ZMASK(MIN, _mm_maskz_min_sd, 0x1); + DOONE_WITH_ZMASK(MUL, _mm_maskz_mul_sd, 0x1); + DOONE_WITH_ZMASK(SUB, _mm_maskz_sub_sd, 0x0); + + // intrinsics with zero masking and rounding mode + DOONE_WITH_ZMASK_ROUND(ADD, _mm_maskz_add_round_sd, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(DIV, _mm_maskz_div_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MAX, _mm_maskz_max_round_sd, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MIN, _mm_maskz_min_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MUL, _mm_maskz_mul_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + 
DOONE_WITH_ZMASK_ROUND(SUB, _mm_maskz_sub_round_sd, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output +++ SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c +++ SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c @@ -0,0 +1,306 @@ +#include "m512_test_util.h" +#include +#include +#include + +/* + * Here we check for _mm_[mask|maskz]_[add|div|max|min|mul|sub]_[round]_ss + * intrinsics. + */ + +int show_op = +#ifdef SHOW_OP + 1 +#else + 0 +#endif + ; + +typedef enum { ASSIGN, ADD, DIV, MAX, MIN, MUL, SUB } OPER; + +static void NOINLINE intop(OPER op, float ivalout[4], float ivalop1[4], + float ivalop2[4]) { + int i; + int handled = 0; + + memset(ivalout, 0, sizeof(ivalout)); + for (i = 0; i < 4; i += 1) { + switch (op) { + case ASSIGN: + handled = 1; + ivalout[i] = ivalop1[i]; + break; + case ADD: + handled = 1; + ivalout[i] = ivalop1[i] + ivalop2[i]; + break; + case DIV: + handled = 1; + ivalout[i] = ivalop1[i] / ivalop2[i]; + break; + case MAX: + handled = 1; + ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i]; + break; + case MIN: + handled = 1; + ivalout[i] = (ivalop1[i] < ivalop2[i]) ? ivalop1[i] : ivalop2[i]; + break; + case MUL: + handled = 1; + ivalout[i] = ivalop2[i] * ivalop1[i]; + break; + case SUB: + handled = 1; + ivalout[i] = ivalop1[i] - ivalop2[i]; + break; + default: + printf("FAIL: bad op\n"); + break; + } + } + if (!handled) { + printf("FAIL: unsupported op\n"); + n_errs++; + } +} + +static int NOINLINE check(float val1[], float good[]) { + int i; + int res = 1; + for (i = 0; i < 4; i += 1) { + if (val1[i] != good[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], good[i]); + } + } + return (res); +} + +static int NOINLINE check_mask(float dest[], float val1[], float good[], + int mask, int zeroing) { + int i, j; + int res = 1; + + // elements number to check dest vector + j = 1; + + if (mask == 1) { + if (val1[0] != good[0]) { + res = 0; + printf("FAIL: %f != %f\n", val1[0], dest[0]); + } + } else if (mask == 0) { + if (zeroing == 1) { + if (val1[0] != 0) { + res = 0; + printf("FAIL: %f != %f\n", val1[0], dest[0]); + } + } else { + j = 0; + } + } + + // check other elements of dest vector + for (i = j; i < 4; i += 1) { + if (val1[i] != dest[i]) { + res = 0; + printf("FAIL: %f != %f\n", val1[i], dest[i]); + } + } + return (res); +} + +static void NOINLINE print_vec(char *pfx, float ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]); +} + +#define DOONE(OP, FUNC) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(v1.xmm[0], v2.xmm[0]); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, 0x1, 0); \ + passed = check(vvv.f32, good.f32); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_MASK(OP, FUNC, MMASK) \ + { \ 
+ int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(vvv.xmm[0], MMASK, v1.xmm[0], v2.xmm[0]); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 0); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_ZMASK(OP, FUNC, MMASK) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(MMASK, v1.xmm[0], v2.xmm[0]); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 1); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_ROUND(OP, FUNC, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(v1.xmm[0], v2.xmm[0], ROUND); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, 0x1, 0); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(vvv.xmm[0], MMASK, v1.xmm[0], v2.xmm[0], ROUND); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 0); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +#define DOONE_WITH_ZMASK_ROUND(OP, FUNC, MMASK, ROUND) \ + { \ + int passed = 0; \ + intop(OP, good.f32, v1.f32, v2.f32); \ + vvv.xmm[0] = FUNC(MMASK, v1.xmm[0], v2.xmm[0], ROUND); \ + passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 1); \ + if (!passed) { \ + printf("FAIL " #FUNC "\n"); \ + n_errs++; \ + } \ + if (!passed || show_op) { \ + print_vec("Opand1", v1.f32); \ + print_vec("Opand2", v2.f32); \ + print_vec("Scalar", good.f32); \ + print_vec("Vector", vvv.f32); \ + } \ + } + +int main() { + float init1[] = {1, -2, 3, -4, 5, 6, 7, 8, 9, 10, -11, 12, 13, 14, 15, 16}; + float init2[] = {11, 12, 23, -24, 35, 36, 17, 38, + 42, -1, 33, 7, 8, 10, 11, 12}; + + V512 v1; + V512 v2; + V512 good; + V512 vvv; + + intop(ASSIGN, v1.f32, init1, 0); + intop(ASSIGN, v2.f32, init2, 0); + vvv.xmm[0] = _mm_setzero_ps(); + + // simple intrinsics + DOONE_WITH_MASK(ADD, _mm_mask_add_ss, 0x1); + DOONE_WITH_MASK(MAX, _mm_mask_max_ss, 0x1); + DOONE_WITH_MASK(MIN, _mm_mask_min_ss, 0x1); + DOONE_WITH_MASK(MUL, _mm_mask_mul_ss, 0x1); + DOONE_WITH_MASK(SUB, _mm_mask_sub_ss, 0x1); + + // intrinsics with rounding mode + DOONE_ROUND(ADD, _mm_add_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(DIV, _mm_div_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MAX, _mm_max_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MIN, _mm_min_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(MUL, _mm_mul_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_ROUND(SUB, _mm_sub_round_ss, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + 
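Every *_round_ss call above passes _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; for the AVX-512 scalar rounding intrinsics an explicit rounding mode is paired with the suppress-all-exceptions flag (embedded rounding implies SAE), the only other accepted value being _MM_FROUND_CUR_DIRECTION. A short sketch of the usual combinations follows (the wrapper macro names are invented for illustration); the masked and zero-masked rounding variants below exercise the same constants together with merge and zero masking.

#include <immintrin.h>

/* The rounding argument must be a compile-time constant, hence macros. */
#define ADD_SS_RN(a, b)  _mm_add_round_ss((a), (b), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
#define ADD_SS_RD(a, b)  _mm_add_round_ss((a), (b), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
#define ADD_SS_RZ(a, b)  _mm_add_round_ss((a), (b), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
#define ADD_SS_CUR(a, b) _mm_add_round_ss((a), (b), _MM_FROUND_CUR_DIRECTION)

int main(void) {
  __m128 a = _mm_set_ss(1.5f), b = _mm_set_ss(2.25f);
  return _mm_cvtss_f32(ADD_SS_RZ(a, b)) == 3.75f ? 0 : 1; /* exact sum, mode irrelevant */
}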
DOONE_WITH_MASK_ROUND(ADD, _mm_mask_add_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(DIV, _mm_mask_div_round_ss, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MAX, _mm_mask_max_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MIN, _mm_mask_min_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(MUL, _mm_mask_mul_round_ss, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_MASK_ROUND(SUB, _mm_mask_sub_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + // intrinsics with zero mask + DOONE_WITH_ZMASK(ADD, _mm_maskz_add_ss, 0x0); + DOONE_WITH_ZMASK(DIV, _mm_maskz_div_ss, 0x1); + DOONE_WITH_ZMASK(MAX, _mm_maskz_max_ss, 0x1); + DOONE_WITH_ZMASK(MIN, _mm_maskz_min_ss, 0x1); + DOONE_WITH_ZMASK(MUL, _mm_maskz_mul_ss, 0x1); + DOONE_WITH_ZMASK(SUB, _mm_maskz_sub_ss, 0x0); + + DOONE_WITH_ZMASK_ROUND(ADD, _mm_maskz_add_round_ss, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(DIV, _mm_maskz_div_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MAX, _mm_maskz_max_round_ss, 0x0, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MIN, _mm_maskz_min_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(MUL, _mm_maskz_mul_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + DOONE_WITH_ZMASK_ROUND(SUB, _mm_maskz_sub_round_ss, 0x1, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output +++ SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c +++ SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c @@ -0,0 +1,273 @@ + +/* + * Test 128 and 256-bit two operand integer intrinsics, + * with masked and zero-masked forms, by comparing + * their output with the corresponding 512-bit intrinsic. + * Here we check for _mm512_[mask|maskz]_[and|andnot|or|xor|add|max|min|mul|sub] + * intrinsics + */ + +#include "m512_test_util.h" +#include + +V512 i8_src1; +V512 i8_src2; +V512 i16_src1; +V512 i16_src2; +V512 i32_src1; +V512 i32_src2; +V512 i64_src1; +V512 i64_src2; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 64; i++) { + i8_src1.s8[i] = i; + i8_src2.s8[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 32; i++) { + i16_src1.s16[i] = i; + i16_src2.s16[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 16; i++) { + i32_src1.s32[i] = i; + i32_src2.s32[i] = (i & 1) ? i : -i; + } + + for (i = 0; i < 8; i++) { + i64_src1.s64[i] = i; + i64_src2.s64[i] = (i & 1) ? i : -i; + } +} + +/* + * Use "soft update" between tests to make compiler think src was updated. + * Prevents PRE'ing a load of src, thus allowing ciscization. + * Also prevents PRE'ing intrinsic operations, ensuring we + * execute the intended instructions. 
+ */ +volatile int vol0 = 0; +#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0] + +/* + * Generate a function that tests a packed int64 intrinsic + * by implementing the XMM, YMM and ZMM versions, and comparing + * the XMM and YMM results with the low part of the ZMM result. + * + * We test regular, masked and zero masked forms. + * + * Use GEN_I64_UNIFORM when the core intrinsic name is the same + * for all vector lengths, e.g. "add_epi64". Otherwise use + * GEN_I64 to list the different names, e.g. "and_si128" and "and_si256". + */ + +#define GEN_I64_UNIFORM(oper) GEN_I64(oper, oper, oper, oper, oper) + +#define GEN_I64(test_name, oper_epi64, oper_xmm, oper_ymm, oper_zmm) \ + void NOINLINE do_##test_name() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask8 k8 = 0x5a; \ + \ + /* Non-masked. */ \ + \ + soft_v512_update(i64_src2); \ + zmm_res.zmmi = _mm512_##oper_zmm(i64_src1.zmmi, i64_src2.zmmi); \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i64_src2); \ + zmm_res.zmmi = _mm512_mask_##oper_epi64(zmm_res.zmmi, k8, i64_src1.zmmi, \ + i64_src2.zmmi); \ + \ + /* Zero-masked. */ \ + \ + zmm_res.zmmi = _mm512_set1_epi64(1); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i64_src2); \ + zmm_res.zmmi = \ + _mm512_maskz_##oper_epi64(k8, i64_src1.zmmi, i64_src2.zmmi); \ + } + +#define GEN_I32_UNIFORM(oper) GEN_I32(oper, oper, oper, oper, oper) + +#define GEN_I32(test_name, oper_epi32, oper_xmm, oper_ymm, oper_zmm) \ + void NOINLINE do_##test_name() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask16 k16 = 0x7feb; \ + __mmask8 k8 = (__mmask8)k16; \ + \ + /* Non-masked. */ \ + \ + soft_v512_update(i32_src2); \ + zmm_res.zmmi = _mm512_##oper_zmm(i32_src1.zmmi, i32_src2.zmmi); \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i32_src2); \ + zmm_res.zmmi = _mm512_mask_##oper_epi32(zmm_res.zmmi, k16, i32_src1.zmmi, \ + i32_src2.zmmi); \ + \ + /* Zero-masked. */ \ + \ + zmm_res.zmmi = _mm512_set1_epi32(1.0); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i32_src2); \ + zmm_res.zmmi = \ + _mm512_maskz_##oper_epi32(k16, i32_src1.zmmi, i32_src2.zmmi); \ + } + +#define GEN_I16_UNIFORM(oper) GEN_I16(oper, oper, oper, oper, oper) + +#define GEN_I16(test_name, oper_epi16, oper_xmm, oper_ymm, oper_zmm) \ + void NOINLINE do_##test_name() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask32 k32 = 0x7febeb7f; \ + __mmask16 k16 = (__mmask16)k32; \ + __mmask8 k8 = (__mmask8)k16; \ + \ + /* Non-masked. */ \ + \ + soft_v512_update(i16_src2); \ + zmm_res.zmmi = _mm512_##oper_zmm(i16_src1.zmmi, i16_src2.zmmi); \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i16_src2); \ + zmm_res.zmmi = _mm512_mask_##oper_epi16(zmm_res.zmmi, k32, i16_src1.zmmi, \ + i16_src2.zmmi); \ + \ + /* Zero-masked. 
*/ \ + \ + zmm_res.zmmi = _mm512_set1_epi32(1.0); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i16_src2); \ + zmm_res.zmmi = \ + _mm512_maskz_##oper_epi16(k32, i16_src1.zmmi, i16_src2.zmmi); \ + } + +#define GEN_I8_UNIFORM(oper) GEN_I8(oper, oper, oper, oper, oper) + +#define GEN_I8(test_name, oper_epi8, oper_xmm, oper_ymm, oper_zmm) \ + void NOINLINE do_##test_name() { \ + V512 xmm_res, ymm_res, zmm_res; \ + __mmask64 k64 = 0xa55a7febeb7f5aa5U; \ + __mmask32 k32 = (__mmask32)k64; \ + __mmask16 k16 = (__mmask16)k32; \ + \ + /* Non-masked. */ \ + \ + soft_v512_update(i8_src2); \ + zmm_res.zmmi = _mm512_##oper_zmm(i8_src1.zmmi, i8_src2.zmmi); \ + \ + /* Masked. */ \ + \ + zmm_res.zmmi = _mm512_setzero_epi32(); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i8_src2); \ + zmm_res.zmmi = _mm512_mask_##oper_epi8(zmm_res.zmmi, k64, i8_src1.zmmi, \ + i8_src2.zmmi); \ + \ + /* Zero-masked. */ \ + \ + zmm_res.zmmi = _mm512_set1_epi32(1.0); \ + ymm_res = zmm_res; \ + xmm_res = zmm_res; \ + \ + soft_v512_update(i8_src2); \ + zmm_res.zmmi = _mm512_maskz_##oper_epi8(k64, i8_src1.zmmi, i8_src2.zmmi); \ + } + +GEN_I32(and_si512, and_epi32, and_si128, and_si256, and_si512) +GEN_I32(andnot_si512, andnot_epi32, andnot_si128, andnot_si256, andnot_si512) +GEN_I32(or_si512, or_epi32, or_si128, or_si256, or_si512) +GEN_I32(xor_si512, xor_epi32, xor_si128, xor_si256, xor_si512) + +GEN_I64(and_epi64, and_epi64, and_si128, and_si256, and_epi64) +GEN_I64(andnot_epi64, andnot_epi64, andnot_si128, andnot_si256, andnot_epi64) +GEN_I64(or_epi64, or_epi64, or_si128, or_si256, or_epi64) +GEN_I64(xor_epi64, xor_epi64, xor_si128, xor_si256, xor_epi64) + +GEN_I64_UNIFORM(add_epi64) +GEN_I64_UNIFORM(max_epi64) +GEN_I64_UNIFORM(max_epu64) +GEN_I64_UNIFORM(min_epi64) +GEN_I64_UNIFORM(min_epu64) +GEN_I64_UNIFORM(mul_epi32) /* Yes, these are really I64 vector elements. */ +GEN_I64_UNIFORM(mul_epu32) /* Yes, these are really I64 vector elements. 
*/ + +GEN_I32(and_epi32, and_epi32, and_si128, and_si256, and_epi32) +GEN_I32(andnot_epi32, andnot_epi32, andnot_si128, andnot_si256, andnot_epi32) +GEN_I32(or_epi32, or_epi32, or_si128, or_si256, or_epi32) +GEN_I32(xor_epi32, xor_epi32, xor_si128, xor_si256, xor_epi32) + +GEN_I32_UNIFORM(add_epi32) +GEN_I32_UNIFORM(max_epi32) +GEN_I32_UNIFORM(max_epu32) +GEN_I32_UNIFORM(min_epi32) +GEN_I32_UNIFORM(min_epu32) +GEN_I32_UNIFORM(sub_epi32) + +int main() { + init(); + + do_and_si512(); + do_andnot_si512(); + do_or_si512(); + do_xor_si512(); + + do_and_epi64(); + do_andnot_epi64(); + do_or_epi64(); + do_xor_epi64(); + + do_add_epi64(); + do_max_epi64(); + do_max_epu64(); + do_min_epi64(); + do_min_epu64(); + do_mul_epi32(); + do_mul_epu32(); + + do_and_epi32(); + do_andnot_epi32(); + do_or_epi32(); + do_xor_epi32(); + + do_add_epi32(); + do_max_epi32(); + do_max_epu32(); + do_min_epi32(); + do_min_epu32(); + do_sub_epi32(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output +++ SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c +++ SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c @@ -0,0 +1,83 @@ +#include "m512_test_util.h" +#include +#include +/* + * Here we check for _mm512_[mask|maskz]_[rsqrt14|rcp14] intrinsics. + */ +#define CHECK_PD(op) \ + { \ + volatile __m512d r = _mm512_##op##_pd(v1); \ + check_equal_ndf(&r, &exp, 8, "_mm512_" #op "_pd", __LINE__); \ + k8 = 0xAA; \ + r = _mm512_mask_##op##_pd(undef, k8, v1); \ + check_equal_ndf(&r, &expm, 8, "_mm512_mask_" #op "_pd{1}", __LINE__); \ + r = _mm512_maskz_##op##_pd(k8, v1); \ + check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_" #op "_pd{0}", __LINE__); \ + } + +#define DECL_PD(op, srcv, expv) \ + void NOINLINE do_##op##_pd() { \ + __mmask8 k8; \ + volatile __m512d v1 = _mm512_set1_pd((srcv)); \ + \ + volatile __m512d undef = _mm512_set1_pd(3.0); \ + __m512d exp = _mm512_set1_pd(expv); \ + __m512d expm = \ + _mm512_set_pd((expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0); \ + __m512d expzm = _mm512_set_pd((expv), 0, (expv), 0, (expv), 0, (expv), 0); \ + \ + CHECK_PD(op); \ + } + +#define TEST_PD(op) do_##op##_pd() + +// PS version starts here. 
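Both the _pd block above and the _ps block that follows compare against exact expected values; that works because the rcp14/rsqrt14 family guarantees a relative error of at most 2^-14 and the inputs (0.25, 0.5, 0.16f, 0.4f) are chosen so the approximation is expected to come back exact. For less friendly inputs a tolerance-based check along the following lines would be needed; the helper below is only an illustrative sketch, not one of the m512_test_util.h routines.

#include <math.h>
#include <stdio.h>

/* Accept any value within the architectural 2^-14 relative-error bound. */
static int within_rel_err_2pow14(double got, double exact) {
  return fabs(got - exact) <= ldexp(fabs(exact), -14); /* |exact| * 2^-14 */
}

int main(void) {
  printf("%d\n", within_rel_err_2pow14(1.99997, 2.0)); /* 1: inside the bound */
  return 0;
}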
+ +#define CHECK_PS(op) \ + { \ + volatile __m512 r = _mm512_##op##_ps(v1); \ + check_equal_nsf(&r, &exp, 16, "_mm512_" #op "_ps", __LINE__); \ + k8 = 0xAAAA; \ + r = _mm512_mask_##op##_ps(undef, k8, v1); \ + check_equal_nsf(&r, &expm, 16, "_mm512_mask_" #op "_ps{1}", __LINE__); \ + r = _mm512_maskz_##op##_ps(k8, v1); \ + check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_" #op "_ps{0}", __LINE__); \ + } + +#define DECL_PS(op, srcv, expv) \ + void NOINLINE do_##op##_ps() { \ + __mmask16 k8; \ + volatile __m512 v1 = _mm512_set1_ps((srcv)); \ + \ + volatile __m512 undef = _mm512_set1_ps(3.0); \ + __m512 exp = _mm512_set1_ps(expv); \ + __m512 expm = \ + _mm512_set_ps((expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0, \ + (expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0); \ + __m512 expzm = _mm512_set_ps((expv), 0, (expv), 0, (expv), 0, (expv), 0, \ + (expv), 0, (expv), 0, (expv), 0, (expv), 0); \ + \ + CHECK_PS(op); \ + } + +#define TEST_PS(op) do_##op##_ps() + +DECL_PD(rsqrt14, 0.25, 2.0) +DECL_PS(rsqrt14, 0.16f, 2.5f) +DECL_PD(rcp14, 0.5, 2.0) +DECL_PS(rcp14, 0.4f, 2.5f) + +int main(int argc, char *argv[]) { + TEST_PD(rcp14); + TEST_PS(rcp14); + TEST_PD(rsqrt14); + TEST_PS(rsqrt14); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c +++ SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c @@ -0,0 +1,100 @@ +#include "m512_test_util.h" +#include +#include +/* + * Here we check for _mm_[mask|maskz]_[rsqrt14|rcp14] intrinsics. 
+ */ +#define CHECK_SCALAR_SD(op) \ + { \ + volatile __m128d r = _mm_##op##_sd(v1, v2); \ + check_equal_ndf(&r, &exp, 2, "_mm_" #op "_sd", __LINE__); \ + k8 = 1; \ + r = _mm_mask_##op##_sd(undef, k8, v1, v2); \ + check_equal_ndf(&r, &expm1, 2, "_mm_mask_" #op "_sd{1}", __LINE__); \ + k8 = 0; \ + r = _mm_mask_##op##_sd(undef, k8, v1, v2); \ + check_equal_ndf(&r, &expm0, 2, "_mm_mask_" #op "_sd{0}", __LINE__); \ + k8 = 1; \ + r = _mm_maskz_##op##_sd(k8, v1, v2); \ + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_" #op "_sd{1}", __LINE__); \ + k8 = 0; \ + r = _mm_maskz_##op##_sd(k8, v1, v2); \ + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_" #op "_sd{0}", __LINE__); \ + } + +#define DECL_SCALAR_SD(op, src1v, src2v, expv) \ + void NOINLINE do_##op##_sd() { \ + __mmask8 k8; \ + volatile __m128d v1 = _mm_set_pd(2.0 /* upr */, (src1v) /* lwr */); \ + volatile __m128d v2 = _mm_set_pd(4.0 /* upr */, (src2v) /* lwr */); \ + \ + volatile __m128d undef = _mm_set_pd(333.0 /* upr */, 111.0 /* lwr */); \ + __m128d exp = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */); \ + __m128d expm1 = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */); \ + __m128d expm0 = _mm_set_pd(2.0 /* upr */, 111.0 /* lwr */); \ + __m128d expzm1 = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */); \ + __m128d expzm0 = _mm_set_pd(2.0 /* upr */, 0.0 /* lwr */); \ + \ + CHECK_SCALAR_SD(op); \ + } + +#define TEST_SCALAR_SD(op) do_##op##_sd() + +#define CHECK_SCALAR_SS(op) \ + { \ + volatile __m128 r = _mm_##op##_ss(v1, v2); \ + check_equal_nsf(&r, &exp, 4, "_mm_" #op "_ss", __LINE__); \ + k8 = 1; \ + r = _mm_mask_##op##_ss(undef, k8, v1, v2); \ + check_equal_nsf(&r, &expm1, 4, "_mm_mask_" #op "_ss", __LINE__); \ + k8 = 0; \ + r = _mm_mask_##op##_ss(undef, k8, v1, v2); \ + check_equal_nsf(&r, &expm0, 4, "_mm_mask_" #op "_ss", __LINE__); \ + k8 = 1; \ + r = _mm_maskz_##op##_ss(k8, v1, v2); \ + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_" #op "_ss", __LINE__); \ + k8 = 0; \ + r = _mm_maskz_##op##_ss(k8, v1, v2); \ + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_" #op "_ss", __LINE__); \ + } + +#define DECL_SCALAR_SS(op, src1v, src2v, expv) \ + void NOINLINE do_##op##_ss() { \ + __mmask8 k8; \ + volatile __m128 v1 = \ + _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (src1v) /* lwr */); \ + volatile __m128 v2 = \ + _mm_set_ps(8.0f /* upr */, 7.0f, 6.0f, (src2v) /* lwr */); \ + \ + volatile __m128 undef = \ + _mm_set_ps(777.0f /* upr */, 555.0f, 333.0f, 111.0f /* lwr */); \ + __m128 exp = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */); \ + __m128 expm1 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */); \ + __m128 expm0 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, 111.0f /* lwr */); \ + __m128 expzm1 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */); \ + __m128 expzm0 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, 0.0f /* lwr */); \ + \ + CHECK_SCALAR_SS(op); \ + } + +#define TEST_SCALAR_SS(op) do_##op##_ss() + +DECL_SCALAR_SD(rsqrt14, 17.0, 0.25, 2.0) +DECL_SCALAR_SS(rsqrt14, 17.0f, 0.16f, 2.5f) +DECL_SCALAR_SD(rcp14, 17.0, 0.5, 2.0) +DECL_SCALAR_SS(rcp14, 17.0f, 0.4f, 2.5f) + +int main(int argc, char *argv[]) { + TEST_SCALAR_SD(rcp14); + TEST_SCALAR_SS(rcp14); + TEST_SCALAR_SD(rsqrt14); + TEST_SCALAR_SS(rsqrt14); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output +++ 
SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/reduce.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce.c +++ SingleSource/UnitTests/Vector/AVX512/reduce.c @@ -0,0 +1,731 @@ +#include "m512_test_util.h" +#include +#include +#include +#include + +/* + * Test reduce instructions. + * Here we check for _mm512_[mask_]reduce_[add|mul|min|max|and|or] intrinsics. + */ + +typedef long long s64; +typedef unsigned long long u64; + +typedef float f32; +typedef double f64; + +typedef int s32; +typedef unsigned int u32; + +int verbose = 0; +#define VERBOSE (verbose > 1) +#define SHOW_OP (verbose > 2) + +typedef enum { + ASSIGN, + ADD, + REDUCE_ADD, + REDUCE_MUL, + REDUCE_MIN, + REDUCE_MAX, + REDUCE_GMIN, + REDUCE_GMAX, + REDUCE_OR, + REDUCE_AND +} OPER; + +__mmask16 mask_true = 0xffff; + +#define MASK(mask, n) ((mask & (0x1 << n)) != 0) + +#define IMin(i, j) (((i) <= (j)) ? (i) : (j)) +#define IMax(i, j) (((i) >= (j)) ? (i) : (j)) + +#define MULOP(a, b) (a * b) +#define ADDOP(a, b) (a + b) +#define OROP(a, b) (a | b) +#define ANDOP(a, b) (a & b) +#define GMINOP(a, b) fmin(a, b) +#define GMAXOP(a, b) fmax(a, b) + +#define DO_MASK_COPY(len, output, mask, input, def) \ + { \ + int n; \ + \ + for (n = 0; n < len; n += 1) { \ + if (MASK(mask, n)) { \ + output[n] = input[n]; \ + } else { \ + output[n] = def; \ + } \ + } \ + } + +#define DO_REDUCE_16(res, mask, input, dtype, oper, initval) \ + { \ + dtype dtype##tmp[4]; \ + V512 vtmp; \ + DO_MASK_COPY(16, vtmp.dtype, mask, input, initval); \ + \ + dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]); \ + dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]); \ + dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]); \ + dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]); \ + \ + dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[8]); \ + dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[9]); \ + dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[10]); \ + dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[11]); \ + \ + dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[12]); \ + dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[13]); \ + dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[14]); \ + dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[15]); \ + \ + dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]); \ + dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]); \ + \ + res = oper(dtype##tmp[0], dtype##tmp[2]); \ + } + +#define DO_REDUCE_8(res, mask, input, dtype, oper, initval) \ + { \ + dtype dtype##tmp[4]; \ + V512 vtmp; \ + DO_MASK_COPY(8, vtmp.dtype, mask, input, initval); \ + \ + dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]); \ + dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]); \ + dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]); \ + dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]); \ + \ + dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]); \ + dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]); \ + \ + res = oper(dtype##tmp[0], dtype##tmp[2]); \ + } + +static int NOINLINE mask_s32_reduce_op(OPER op, __mmask16 mask, + int s32op1[16]) { + int handled = 0; + int res; + + switch (op) { + + case REDUCE_ADD: + handled = 1; + DO_REDUCE_16(res, mask, s32op1, s32, ADDOP, 0); + break; + + case REDUCE_MUL: + handled = 1; + DO_REDUCE_16(res, mask, s32op1, s32, MULOP, 1); + break; + + case REDUCE_MIN: + handled = 1; + DO_REDUCE_16(res, mask, s32op1, s32, IMin, 0x7fffffff); + break; + + case REDUCE_MAX: + handled = 
1; + DO_REDUCE_16(res, mask, s32op1, s32, IMax, 0x80000000); + break; + + case REDUCE_OR: + handled = 1; + DO_REDUCE_16(res, mask, s32op1, s32, OROP, 0); + break; + + case REDUCE_AND: + handled = 1; + DO_REDUCE_16(res, mask, s32op1, s32, ANDOP, 0xffffffff); + break; + + default: + printf("FAIL: mask_s32_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_s32_reduce_op: unsupported op\n"); + } + return (res); +} + +static int NOINLINE mask_u32_reduce_op(OPER op, __mmask16 mask, + u32 u32op1[16]) { + int handled = 0; + int res; + + switch (op) { + + case REDUCE_MIN: + handled = 1; + DO_REDUCE_16(res, mask, u32op1, u32, IMin, 0xffffffff); + break; + + case REDUCE_MAX: + handled = 1; + DO_REDUCE_16(res, mask, u32op1, u32, IMax, 0x00000000); + break; + + default: + printf("FAIL: mask_u32_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_u32_reduce_op: unsupported op\n"); + } + return (res); +} + +static void NOINLINE init_s32(int s32out[16], int s32op1[16]) { + int i = 0; + for (i = 0; i < 16; i++) { + s32out[i] = s32op1[i]; + } +} + +static void NOINLINE init_f32(float f32out[16], float f32op1[16]) { + int i = 0; + for (i = 0; i < 16; i++) { + f32out[i] = f32op1[i]; + } +} + +static float NOINLINE mask_f32_reduce_op(OPER op, __mmask16 mask, + float valop1[16]) { + int handled = 0; + float res; + union { + float f32init; + int s32init; + } init; + + switch (op) { + + case REDUCE_ADD: + handled = 1; + DO_REDUCE_16(res, mask, valop1, f32, ADDOP, 0.0); + break; + + case REDUCE_MUL: + handled = 1; + DO_REDUCE_16(res, mask, valop1, f32, MULOP, 1.0); + break; + + case REDUCE_GMIN: + handled = 1; + init.s32init = 0x7f800000; /* +inf */ + DO_REDUCE_16(res, mask, valop1, f32, GMINOP, init.f32init); + break; + + case REDUCE_GMAX: + handled = 1; + init.s32init = 0xff800000; /* -inf */ + DO_REDUCE_16(res, mask, valop1, f32, GMAXOP, init.f32init); + break; + + default: + printf("FAIL: mask_f32_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_f32_reduce_op: unsupported op\n"); + } + return (res); +} + +static void NOINLINE init_f64(double f64out[8], double f64op1[8]) { + int i = 0; + for (i = 0; i < 8; i++) { + f64out[i] = f64op1[i]; + } +} + +static double NOINLINE mask_f64_reduce_op(OPER op, __mmask16 mask, + double valop1[8]) { + int handled = 0; + double res; + union { + double f64init; + int s32init[2]; + } init; + + switch (op) { + + case REDUCE_ADD: + handled = 1; + DO_REDUCE_8(res, mask, valop1, f64, ADDOP, 0.0); + break; + + case REDUCE_MUL: + handled = 1; + DO_REDUCE_8(res, mask, valop1, f64, MULOP, 1.0); + break; + + case REDUCE_GMIN: + handled = 1; + init.s32init[0] = 0x00000000; /* +inf */ + init.s32init[1] = 0x7ff00000; /* +inf */ + DO_REDUCE_8(res, mask, valop1, f64, GMINOP, init.f64init); + break; + + case REDUCE_GMAX: + handled = 1; + init.s32init[0] = 0x00000000; /* -inf */ + init.s32init[1] = 0xfff00000; /* -inf */ + DO_REDUCE_8(res, mask, valop1, f64, GMAXOP, init.f64init); + break; + + default: + printf("FAIL: mask_f64_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_f64_reduce_op: unsupported op\n"); + } + return (res); +} + +static void NOINLINE print_s32(char *pfx, int var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%5d", var); + printf("\n"); +} + +static void NOINLINE print_u32(char *pfx, u32 var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%5u", var); + printf("\n"); +} + +static void NOINLINE print_f32(char *pfx, float 
var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%5.2f", var); + printf("\n"); +} + +static void NOINLINE print_f64(char *pfx, double var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%5.2lf", var); + printf("\n"); +} + +static void NOINLINE print_ivec(char *pfx, int ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + char *fmt = "%5d %5d %5d %5d "; + printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]); + printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]); + printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]); + printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]); + printf("\n"); +} + +static void NOINLINE print_uvec(char *pfx, u32 ivec[]) { + if (pfx) { + printf("%s: ", pfx); + } + char *fmt = "%5u %5u %5u %5u "; + printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]); + printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]); + printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]); + printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]); + printf("\n"); +} + +static void NOINLINE print_fvec(char *pfx, float fvec[]) { + if (pfx) { + printf("%s: ", pfx); + } + char *fmt = "%5.2f %5.2f %5.2f %5.2f "; + printf(fmt, fvec[15], fvec[14], fvec[13], fvec[12]); + printf(fmt, fvec[11], fvec[10], fvec[9], fvec[8]); + printf(fmt, fvec[7], fvec[6], fvec[5], fvec[4]); + printf(fmt, fvec[3], fvec[2], fvec[1], fvec[0]); + printf("\n"); +} + +static void NOINLINE print_dvec(char *pfx, double dvec[]) { + if (pfx) { + printf("%s: ", pfx); + } + char *fmt = "%5.2lf %5.2lf %5.2lf %5.2lf "; + printf(fmt, dvec[7], dvec[6], dvec[5], dvec[4]); + printf(fmt, dvec[3], dvec[2], dvec[1], dvec[0]); + printf("\n"); +} + +#define PRINT_MASK(bits, width, pfx, var) \ + print_mask(bits, "%" #width "d ", pfx, var) + +static void NOINLINE print_mask(int bits, char *fmt, char *pfx, + __mmask16 mask) { + int i; + if (pfx) { + printf("%s: ", pfx); + } + for (i = bits; i >= 1; i -= 1) { + printf(fmt, MASK(mask, (i - 1))); + } + printf("\n"); +} + +#define CHECK_PRINT(STATUS, FUNC) \ + if (!(STATUS)) { \ + printf("FAIL " #FUNC "\n"); \ + err += 1; \ + } else if (VERBOSE) { \ + printf("PASS " #FUNC "\n"); \ + } + +#define CHECK_REDUCE_S32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_ivec("Opand1", v1.s32); \ + print_s32("Scalar", result); \ + print_s32("Vector", mresult); \ + } \ + } + +#define CHECK_REDUCE_U32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_uvec("Opand1", v1.u32); \ + print_u32("Scalar", result); \ + print_u32("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_S32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_ivec("Opand1", v1.s32); \ + PRINT_MASK(16, 5, " Mask", mask); \ + print_s32("Scalar", result); \ + print_s32("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_U32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_uvec("Opand1", v1.u32); \ + PRINT_MASK(16, 5, " Mask", mask); \ + print_u32("Scalar", result); \ + print_u32("Vector", mresult); \ + } \ + } + +#define CHECK_REDUCE_F32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_fvec("Opand1", v1.f32); \ + print_f32("Scalar", result); \ + print_f32("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_F32(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ 
+ if (!passed || SHOW_OP) { \ + print_fvec("Opand1", v1.f32); \ + PRINT_MASK(16, 9, " Mask", mask); \ + print_f32("Scalar", result); \ + print_f32("Vector", mresult); \ + } \ + } + +#define CHECK_REDUCE_F64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_dvec("Opand1", v1.f64); \ + print_f64("Scalar", result); \ + print_f64("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_F64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_dvec("Opand1", v1.f64); \ + PRINT_MASK(8, 10, " Mask", mask); \ + print_f64("Scalar", result); \ + print_f64("Vector", mresult); \ + } \ + } + +#define DOONE_REDUCE_S32(OP, FUNC) \ + { \ + int result; \ + int mresult; \ + result = mask_s32_reduce_op(OP, mask_true, v1.s32); \ + mresult = FUNC(v1.zmmi); \ + CHECK_REDUCE_S32(FUNC); \ + } + +#define DOONE_MASK_REDUCE_S32(OP, mask, FUNC) \ + { \ + int result; \ + int mresult; \ + result = mask_s32_reduce_op(OP, mask, v1.s32); \ + mresult = FUNC(mask, v1.zmmi); \ + CHECK_MASK_REDUCE_S32(FUNC); \ + } + +#define DOONE_REDUCE_U32(OP, FUNC) \ + { \ + u32 result; \ + u32 mresult; \ + result = mask_u32_reduce_op(OP, mask_true, v1.u32); \ + mresult = FUNC(v1.zmmi); \ + CHECK_REDUCE_U32(FUNC); \ + } + +#define DOONE_MASK_REDUCE_U32(OP, mask, FUNC) \ + { \ + int result; \ + int mresult; \ + result = mask_u32_reduce_op(OP, mask, v1.u32); \ + mresult = FUNC(mask, v1.zmmi); \ + CHECK_MASK_REDUCE_U32(FUNC); \ + } + +#define DOONE_REDUCE_F32(OP, FUNC) \ + { \ + float result; \ + float mresult; \ + result = mask_f32_reduce_op(OP, mask_true, v1.f32); \ + mresult = FUNC(v1.zmm); \ + CHECK_REDUCE_F32(FUNC); \ + } + +#define DOONE_MASK_REDUCE_F32(OP, mask, FUNC) \ + { \ + float result; \ + float mresult; \ + result = mask_f32_reduce_op(OP, mask, v1.f32); \ + mresult = FUNC(mask, v1.zmm); \ + CHECK_MASK_REDUCE_F32(FUNC); \ + } + +#define DOONE_REDUCE_F64(OP, FUNC) \ + { \ + double result; \ + double mresult; \ + result = mask_f64_reduce_op(OP, mask_true, v1.f64); \ + mresult = FUNC(v1.zmmd); \ + CHECK_REDUCE_F64(FUNC); \ + } + +#define DOONE_MASK_REDUCE_F64(OP, mask, FUNC) \ + { \ + double result; \ + double mresult; \ + memset(&result, 0, sizeof(result)); \ + memset(&mresult, 0, sizeof(mresult)); \ + result = mask_f64_reduce_op(OP, mask, v1.f64); \ + mresult = FUNC(mask, v1.zmmd); \ + CHECK_MASK_REDUCE_F64(FUNC); \ + } + +__mmask16 mvals[] = {0, 0x82a5}; + +int main(int argc, char *argv[]) { + int i; + int err = 0; + int init1[16] = {7, 1, -3, 3, 1, 1, 2, 3, 1, 3, 2, 3, -5, 1, 11, 3}; + + float finit1[16] = {-1.0, -2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 18.0, + -9.0, 10.0, 11.0, -12.0, 13.0, 14.0, 15.0, 16.0}; + + double dinit1[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + double dinit2[8] = {0.5, 2.0, 3.0, 2.1, 5.0, 5.2, 7.1, 3.1}; + + V512 v1; + __mmask16 mask = 0x82a5; + + verbose = argc; + + /* zmmi/s32 tests ---------------------------------------- */ + /* _mm512_reduce_add_epi32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_S32(REDUCE_ADD, _mm512_reduce_add_epi32); + + /* _mm512_reduce_mul_epi32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_S32(REDUCE_MUL, _mm512_reduce_mul_epi32); + + /* _mm512_reduce_min_epi32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_S32(REDUCE_MIN, _mm512_reduce_min_epi32); + + /* _mm512_reduce_max_epi32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_S32(REDUCE_MAX, _mm512_reduce_max_epi32); + + /* _mm512_reduce_and_epi32 */ + init_s32(v1.s32, init1); + 
DOONE_REDUCE_S32(REDUCE_AND, _mm512_reduce_and_epi32); + + /* _mm512_reduce_or_epi32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_S32(REDUCE_OR, _mm512_reduce_or_epi32); + + /* _mm512_reduce_min_epu32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_U32(REDUCE_MIN, _mm512_reduce_min_epu32); + + /* _mm512_reduce_max_epu32 */ + init_s32(v1.s32, init1); + DOONE_REDUCE_U32(REDUCE_MAX, _mm512_reduce_max_epu32); + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_mask_reduce_min_epu32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_U32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epu32); + + /* _mm512_mask_reduce_max_epu32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_U32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epu32); + } + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_mask_reduce_add_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_ADD, mask, _mm512_mask_reduce_add_epi32); + + /* _mm512_mask_reduce_mul_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_epi32); + + /* _mm512_mask_reduce_min_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epi32); + + /* _mm512_mask_reduce_max_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epi32); + + /* _mm512_mask_reduce_and_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_AND, mask, _mm512_mask_reduce_and_epi32); + + /* _mm512_mask_reduce_or_epi32 */ + init_s32(v1.s32, init1); + DOONE_MASK_REDUCE_S32(REDUCE_OR, mask, _mm512_mask_reduce_or_epi32); + } + + /* zmm/f32 tests ---------------------------------------- */ + /* _mm512_reduce_add_ps */ + init_f32(v1.f32, finit1); + DOONE_REDUCE_F32(REDUCE_ADD, _mm512_reduce_add_ps); + + /* _mm512_reduce_mul_ps */ + init_f32(v1.f32, finit1); + DOONE_REDUCE_F32(REDUCE_MUL, _mm512_reduce_mul_ps); + + /* _mm512_reduce_gmin_ps */ + init_f32(v1.f32, finit1); + DOONE_REDUCE_F32(REDUCE_GMIN, _mm512_reduce_min_ps); + + /* _mm512_reduce_gmax_ps */ + init_f32(v1.f32, finit1); + DOONE_REDUCE_F32(REDUCE_GMAX, _mm512_reduce_max_ps); + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_reduce_gmax_ps */ + init_f32(v1.f32, finit1); + DOONE_MASK_REDUCE_F32(REDUCE_GMIN, mask, _mm512_mask_reduce_min_ps); + + /* _mm512_reduce_gmax_ps */ + init_f32(v1.f32, finit1); + DOONE_MASK_REDUCE_F32(REDUCE_GMAX, mask, _mm512_mask_reduce_max_ps); + + /* _mm512_reduce_mul_ps */ + init_f32(v1.f32, finit1); + DOONE_MASK_REDUCE_F32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_ps); + + /* _mm512_reduce_add_ps */ + init_f32(v1.f32, finit1); + DOONE_MASK_REDUCE_F32(REDUCE_ADD, mask, _mm512_mask_reduce_add_ps); + } + + /* zmmd/f64 tests ---------------------------------------- */ + /* _mm512_reduce_add_pd */ + init_f64(v1.f64, dinit1); + DOONE_REDUCE_F64(REDUCE_ADD, _mm512_reduce_add_pd); + + /* _mm512_reduce_mul_pd */ + init_f64(v1.f64, dinit1); + DOONE_REDUCE_F64(REDUCE_MUL, _mm512_reduce_mul_pd); + + /* _mm512_reduce_gmin_pd */ + init_f64(v1.f64, dinit1); + DOONE_REDUCE_F64(REDUCE_GMIN, _mm512_reduce_min_pd); + + /* _mm512_reduce_gmax_pd */ + init_f64(v1.f64, dinit1); + DOONE_REDUCE_F64(REDUCE_GMAX, _mm512_reduce_max_pd); + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_mask_reduce_gmin_ps */ + init_f64(v1.f64, dinit1); + DOONE_MASK_REDUCE_F64(REDUCE_GMIN, mask, _mm512_mask_reduce_min_pd); + + /* _mm512_mask_reduce_gmax_ps */ + init_f64(v1.f64, dinit2); + DOONE_MASK_REDUCE_F64(REDUCE_GMAX, mask, 
_mm512_mask_reduce_max_pd); + + /* _mm512_mask_reduce_mul_ps */ + init_f64(v1.f64, dinit1); + DOONE_MASK_REDUCE_F64(REDUCE_MUL, mask, _mm512_mask_reduce_mul_pd); + + /* _mm512_mask_reduce_add_ps */ + init_f64(v1.f64, dinit2); + DOONE_MASK_REDUCE_F64(REDUCE_ADD, mask, _mm512_mask_reduce_add_pd); + } + + if (err) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/reduce.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce.reference_output +++ SingleSource/UnitTests/Vector/AVX512/reduce.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c +++ SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c @@ -0,0 +1,73 @@ +#include "m512_test_util.h" +#include +#include + +/* + * Here we check for _mm512_[mask_]reduce_[add|mul] intrinsics. + */ + +#define CHECK_PD(op) \ + { \ + volatile double r = _mm512_##op##_pd(v1); \ + check_equal_ndf(&r, &exp1, 1, "_mm512_" #op "_pd", __LINE__); \ + k8 = 0xAA; \ + r = _mm512_mask_##op##_pd(k8, v1); \ + check_equal_ndf(&r, &exp2, 1, "_mm512_mask_" #op "_pd{1}", __LINE__); \ + } + +#define DECL_PD(op, srcv, out1, out2) \ + void NOINLINE do_##op##_pd() { \ + __mmask8 k8; \ + volatile __m512d v1 = _mm512_set1_pd((srcv)); \ + \ + double exp1 = (out1); \ + double exp2 = (out2); \ + \ + CHECK_PD(op); \ + } + +#define TEST_PD(op) do_##op##_pd() + +// PS version starts here. + +#define CHECK_PS(op) \ + { \ + volatile float r = _mm512_##op##_ps(v1); \ + check_equal_nsf(&r, &exp1, 1, "_mm512_" #op "_ps", __LINE__); \ + k8 = 0xAAAA; \ + r = _mm512_mask_##op##_ps(k8, v1); \ + check_equal_nsf(&r, &exp2, 1, "_mm512_mask_" #op "_ps{1}", __LINE__); \ + } + +#define DECL_PS(op, srcv, out1, out2) \ + void NOINLINE do_##op##_ps() { \ + __mmask16 k8; \ + volatile __m512 v1 = _mm512_set1_ps((srcv)); \ + \ + float exp1 = (out1); \ + float exp2 = (out2); \ + \ + CHECK_PS(op); \ + } + +#define TEST_PS(op) do_##op##_ps() + +DECL_PD(reduce_add, 0.5, 4.0, 2.0) +DECL_PS(reduce_add, 0.4f, 6.4f, 3.2f) +DECL_PD(reduce_mul, 1.1, 2.1435f, 1.4641f) +DECL_PS(reduce_mul, -1.1f, 4.5949f, 2.1435f) + +int main(int argc, char *argv[]) { + TEST_PD(reduce_add); + TEST_PS(reduce_add); + TEST_PD(reduce_mul); + TEST_PS(reduce_mul); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/reduce_int64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce_int64.c +++ SingleSource/UnitTests/Vector/AVX512/reduce_int64.c @@ -0,0 +1,398 @@ +#include "m512_test_util.h" +#include +#include +#include +#include +/* + * Here we check for _mm512_[mask_]reduce_[add|mul|min|max] intrinsics. 
+ */ +typedef __int64 s64; +typedef unsigned __int64 u64; + +typedef float f32; +typedef double f64; + +typedef int s32; +typedef unsigned int u32; + +int verbose = 0; +#define VERBOSE (verbose > 1) +#define SHOW_OP (verbose > 2) +#define SCALE_TRACE (verbose > 3) + +typedef enum { + REDUCE_ADD, + REDUCE_MUL, + REDUCE_MIN, + REDUCE_MAX, + REDUCE_OR, + REDUCE_AND +} OPER; + +__mmask16 mask_true = 0xffff; + +#define MASK(mask, n) ((mask & (0x1 << n)) != 0) + +#define IMin(i, j) (((i) <= (j)) ? (i) : (j)) +#define IMax(i, j) (((i) >= (j)) ? (i) : (j)) + +#define MULOP(a, b) (a * b) +#define ADDOP(a, b) (a + b) +#define OROP(a, b) (a | b) +#define ANDOP(a, b) (a & b) + +#define DO_MASK_COPY(len, output, mask, input, def) \ + { \ + int n; \ + \ + for (n = 0; n < len; n += 1) { \ + if (MASK(mask, n)) { \ + output[n] = input[n]; \ + } else { \ + output[n] = def; \ + } \ + } \ + } + +#define DO_REDUCE_8(res, mask, input, dtype, oper, initval) \ + { \ + dtype dtype##tmp[4]; \ + V512 vtmp; \ + DO_MASK_COPY(8, vtmp.dtype, mask, input, initval); \ + \ + dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]); \ + dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]); \ + dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]); \ + dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]); \ + \ + dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]); \ + dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]); \ + \ + res = oper(dtype##tmp[0], dtype##tmp[2]); \ + } + +static __int64 NOINLINE mask_s64_reduce_op(OPER op, __mmask16 mask, + __int64 s64op1[8]) { + int handled = 0; + __int64 res; + + switch (op) { + + case REDUCE_ADD: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, ADDOP, 0); + break; + + case REDUCE_MUL: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, MULOP, 1); + break; + + case REDUCE_MIN: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, IMin, 0x7fffffffffffffff); + break; + + case REDUCE_MAX: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, IMax, 0x8000000000000000); + break; + + case REDUCE_OR: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, OROP, 0); + break; + + case REDUCE_AND: + handled = 1; + DO_REDUCE_8(res, mask, s64op1, s64, ANDOP, 0xffffffffffffffff); + break; + + default: + printf("FAIL: mask_s64_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_s64_reduce_op: unsupported op\n"); + } + return (res); +} + +static __int64 NOINLINE mask_u64_reduce_op(OPER op, __mmask16 mask, + unsigned __int64 u64op1[8]) { + int handled = 0; + __int64 res; + + switch (op) { + + case REDUCE_MIN: + handled = 1; + DO_REDUCE_8(res, mask, u64op1, u64, IMin, 0xffffffffffffffff); + break; + + case REDUCE_MAX: + handled = 1; + DO_REDUCE_8(res, mask, u64op1, u64, IMax, 0x0000000000000000); + break; + + default: + printf("FAIL: mask_u64_reduce_op: bad op\n"); + exit(1); + break; + } + if (!handled) { + printf("FAIL: mask_u64_reduce_op: unsupported op\n"); + } + return (res); +} + +static void NOINLINE init_s64(__int64 s64out[8], __int64 s64op1[8]) { + int i = 0; + for (i = 0; i < 8; i += 1) { + s64out[i] = s64op1[i]; + } +} + +static void NOINLINE print_s64(char *pfx, __int64 var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%15lld", var); + printf("\n"); +} + +static void NOINLINE print_u64(char *pfx, u64 var) { + if (pfx) { + printf("%s: ", pfx); + } + printf("%15llu", var); + printf("\n"); +} + +static void NOINLINE print_ivec(char *pfx, __int64 ivec[]) { + char *fmt = "%5ld %5ld %5ld %5ld "; + if (pfx) { + printf("%s: ", pfx); + } + printf(fmt, ivec[7], ivec[6], 
ivec[5], ivec[4]); + printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]); + printf("\n"); +} + +static void NOINLINE print_uvec(char *pfx, unsigned __int64 ivec[]) { + char *fmt = "%5lu %5lu %5lu %5lu "; + if (pfx) { + printf("%s: ", pfx); + } + printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]); + printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]); + printf("\n"); +} + +#define PRINT_MASK(bits, width, pfx, var) \ + print_mask(bits, "%" #width "d ", pfx, var) + +static void NOINLINE print_mask(int bits, char *fmt, char *pfx, + __mmask16 mask) { + int i; + if (pfx) { + printf("%s: ", pfx); + } + for (i = bits; i >= 1; i -= 1) { + printf(fmt, MASK(mask, (i - 1))); + } + printf("\n"); +} + +#define CHECK_PRINT(STATUS, FUNC) \ + if (!(STATUS)) { \ + printf("FAIL " #FUNC "\n"); \ + err += 1; \ + } else if (VERBOSE) { \ + printf("PASS " #FUNC "\n"); \ + } + +#define CHECK_REDUCE_S64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_ivec("Opand1", v1.s64); \ + print_s64("Scalar", result); \ + print_s64("Vector", mresult); \ + } \ + } + +#define CHECK_REDUCE_U64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_uvec("Opand1", v1.u64); \ + print_u64("Scalar", result); \ + print_u64("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_S64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_ivec("Opand1", v1.s64); \ + PRINT_MASK(8, 5, " Mask", mask); \ + print_s64("Scalar", result); \ + print_s64("Vector", mresult); \ + } \ + } + +#define CHECK_MASK_REDUCE_U64(FUNC) \ + { \ + int passed = (result == mresult); \ + CHECK_PRINT(passed, FUNC); \ + if (!passed || SHOW_OP) { \ + print_uvec("Opand1", v1.u64); \ + PRINT_MASK(8, 5, " Mask", mask); \ + print_u64("Scalar", result); \ + print_u64("Vector", mresult); \ + } \ + } + +#define DOONE_REDUCE_S64(OP, FUNC) \ + { \ + __int64 result; \ + __int64 mresult; \ + result = mask_s64_reduce_op(OP, mask_true, v1.s64); \ + mresult = FUNC(v1.zmmi); \ + CHECK_REDUCE_S64(FUNC); \ + } + +#define DOONE_MASK_REDUCE_S64(OP, mask, FUNC) \ + { \ + __int64 result; \ + __int64 mresult; \ + result = mask_s64_reduce_op(OP, mask, v1.s64); \ + mresult = FUNC(mask, v1.zmmi); \ + CHECK_MASK_REDUCE_S64(FUNC); \ + } + +#define DOONE_REDUCE_U64(OP, FUNC) \ + { \ + unsigned __int64 result; \ + unsigned __int64 mresult; \ + result = mask_u64_reduce_op(OP, mask_true, v1.u64); \ + mresult = FUNC(v1.zmmi); \ + CHECK_REDUCE_U64(FUNC); \ + } + +#define DOONE_MASK_REDUCE_U64(OP, mask, FUNC) \ + { \ + unsigned __int64 result; \ + unsigned __int64 mresult; \ + result = mask_u64_reduce_op(OP, mask, v1.u64); \ + mresult = FUNC(mask, v1.zmmi); \ + CHECK_MASK_REDUCE_U64(FUNC); \ + } + +__mmask16 mvals[] = {0, 0x82a5}; +__int64 init1[8] = {7, 1, 11, 3, 1, 1, 2, 3}; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 8; i++) { + init1[i] = init1[i]; /* No change, but compiler does not know this. 
*/ + } +} + +int main(int argc, char *argv[]) { + int i; + int err = 0; + + V512 v1; + __mmask16 mask = 0; + + verbose = argc; + + init(); + + /* zmmi/s64 tests ---------------------------------------- */ + /* _mm512_reduce_add_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_ADD, _mm512_reduce_add_epi64); + + /* _mm512_reduce_mul_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_MUL, _mm512_reduce_mul_epi64); + + /* _mm512_reduce_min_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_MIN, _mm512_reduce_min_epi64); + + /* _mm512_reduce_max_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_MAX, _mm512_reduce_max_epi64); + + /* _mm512_reduce_and_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_AND, _mm512_reduce_and_epi64); + + /* _mm512_reduce_or_epi64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_S64(REDUCE_OR, _mm512_reduce_or_epi64); + + /* _mm512_reduce_min_epu64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_U64(REDUCE_MIN, _mm512_reduce_min_epu64); + + /* _mm512_reduce_max_epu64 */ + init_s64(v1.s64, init1); + DOONE_REDUCE_U64(REDUCE_MAX, _mm512_reduce_max_epu64); + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_mask_reduce_min_epu64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_U64(REDUCE_MIN, mask, _mm512_mask_reduce_min_epu64); + + /* _mm512_mask_reduce_max_epu64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_U64(REDUCE_MAX, mask, _mm512_mask_reduce_max_epu64); + } + + for (i = 0; i < 2; i += 1) { + mask = mvals[i]; + /* _mm512_mask_reduce_add_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_ADD, mask, _mm512_mask_reduce_add_epi64); + + /* _mm512_mask_reduce_mul_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_MUL, mask, _mm512_mask_reduce_mul_epi64); + + /* _mm512_mask_reduce_min_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_MIN, mask, _mm512_mask_reduce_min_epi64); + + /* _mm512_mask_reduce_max_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_MAX, mask, _mm512_mask_reduce_max_epi64); + + /* _mm512_mask_reduce_and_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_AND, mask, _mm512_mask_reduce_and_epi64); + + /* _mm512_mask_reduce_or_epi64 */ + init_s64(v1.s64, init1); + DOONE_MASK_REDUCE_S64(REDUCE_OR, mask, _mm512_mask_reduce_or_epi64); + } + + if (err) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output +++ SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c +++ SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c @@ -0,0 +1,133 @@ +#include "m512_test_util.h" +#include + +/* + * Here we check for _mm512_[mask|mmaskz]roundscale[_round]_[ss|sd] intrinsics. 
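+ * This file exercises the packed __m512d/__m512 (_pd/_ps) forms; the scalar
+ * _ss/_sd forms are covered separately in roundscale_scalar.c.
+ * The imm8 values below follow the VRNDSCALE encoding: with bit 2 clear,
+ * bits [1:0] pick the rounding mode (0x8 = to nearest, 0x9 = toward -inf,
+ * 0xA = toward +inf, 0xB = toward zero), bit 3 suppresses the precision
+ * exception, and the upper nibble (the scale) is zero, so the inputs are
+ * simply rounded to whole numbers.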
+ */ + +void __declspec(noinline) do_roundscale_pd() { + __mmask8 k8; + volatile __m512d v1 = _mm512_set1_pd((1.6)); + volatile __m512d undef = _mm512_set1_pd(3.0); + __m512d exp = _mm512_set1_pd((2.0)); + __m512d exp1 = _mm512_set1_pd((1.0)); + __m512d expm = _mm512_set_pd((2.0), 3.0, (2.0), 3.0, (2.0), 3.0, (2.0), 3.0); + __m512d expzm = _mm512_set_pd((1.0), 0, (1.0), 0, (1.0), 0, (1.0), 0); + + { + volatile __m512d r = _mm512_roundscale_pd(v1, 0x8); + check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_pd{0x8}", __LINE__); + r = _mm512_roundscale_pd(v1, 0x9); + check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_pd{0x9}", __LINE__); + k8 = 0xAA; + r = _mm512_mask_roundscale_pd(undef, k8, v1, 0xA); + check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_pd{1}{0xA}", + __LINE__); + r = _mm512_maskz_roundscale_pd(k8, v1, 0xB); + check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_pd{0}{0xB}", + __LINE__); + } + + { + volatile __m512d r = + _mm512_roundscale_round_pd(v1, 0x8, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_round_pd{0x8}", __LINE__); + r = _mm512_roundscale_round_pd(v1, 0x9, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_round_pd{0x9}", __LINE__); + k8 = 0xAA; + r = _mm512_mask_roundscale_round_pd(undef, k8, v1, 0xA, + (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_round_pd{1}{0xA}", + __LINE__); + r = _mm512_maskz_roundscale_round_pd(k8, v1, 0xB, + (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_round_pd{0}{0xB}", + __LINE__); + } + + { + volatile __m512d r = + _mm512_roundscale_round_pd(v1, 0x8, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_round_pd{0x8}", __LINE__); + r = _mm512_roundscale_round_pd(v1, 0x9, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_round_pd{0x9}", __LINE__); + k8 = 0xAA; + r = _mm512_mask_roundscale_round_pd(undef, k8, v1, 0xA, + ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_round_pd{1}{0xA}", + __LINE__); + r = _mm512_maskz_roundscale_round_pd(k8, v1, 0xB, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_round_pd{0}{0xB}", + __LINE__); + } +} +void __declspec(noinline) do_roundscale_ps() { + __mmask16 k8; + volatile __m512 v1 = _mm512_set1_ps((-1.6f)); + volatile __m512 undef = _mm512_set1_ps(3.0); + __m512 exp = _mm512_set1_ps((-2.0f)); + __m512 exp1 = _mm512_set1_ps((-2.0f)); + __m512 expm = + _mm512_set_ps((-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0, + (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0); + __m512 expzm = _mm512_set_ps((-1.0f), 0, (-1.0f), 0, (-1.0f), 0, (-1.0f), 0, + (-1.0f), 0, (-1.0f), 0, (-1.0f), 0, (-1.0f), 0); + + { + volatile __m512 r = _mm512_roundscale_ps(v1, 0x8); + check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_{0x8}", __LINE__); + r = _mm512_roundscale_ps(v1, 0x9); + check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_{0x9}", __LINE__); + k8 = 0xAAAA; + r = _mm512_mask_roundscale_ps(undef, k8, v1, 0xA); + check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_{1}{A}", __LINE__); + r = _mm512_maskz_roundscale_ps(k8, v1, 0xB); + check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_{0}{B}", __LINE__); + } + + { + volatile __m512 r = + _mm512_roundscale_round_ps(v1, 0x8, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_round_ps", __LINE__); + r = _mm512_roundscale_round_ps(v1, 0x9, (_MM_FROUND_CUR_DIRECTION)); + 
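/* v1 is -1.6f: rounding to nearest (0x8) and toward -inf (0x9) both give
+       -2.0f, so exp and exp1 hold the same expected value. */ + 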
check_equal_nsf(&r, &exp1, 16, "_mm512_roundscale_round_ps", __LINE__); + k8 = 0xAAAA; + r = _mm512_mask_roundscale_round_ps(undef, k8, v1, 0xA, + (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_round_{1}{A}", + __LINE__); + r = _mm512_maskz_roundscale_round_ps(k8, v1, 0xB, + (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_round_{0}{B}", + __LINE__); + } + + { + volatile __m512 r = + _mm512_roundscale_round_ps(v1, 0x8, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_round_ps", __LINE__); + r = _mm512_roundscale_round_ps(v1, 0x9, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &exp1, 16, "_mm512_roundscale_round_ps", __LINE__); + k8 = 0xAAAA; + r = _mm512_mask_roundscale_round_ps(undef, k8, v1, 0xA, + ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_round_{1}{A}", + __LINE__); + r = _mm512_maskz_roundscale_round_ps(k8, v1, 0xB, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_round_{0}{B}", + __LINE__); + } +} + +int main(int argc, char *argv[]) { + do_roundscale_pd(); + do_roundscale_ps(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output +++ SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c +++ SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c @@ -0,0 +1,98 @@ +#include "m512_test_util.h" +#include +#include + +/* + * Here we check for _mm_[mask|maskz]roundscale_[ss|sd] intrinsics. 
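+ * The scalar forms round only the low element of the second source (imm8
+ * low bits pick the mode: 0x8 = to nearest, 0x9 = toward -inf, 0xA = toward
+ * +inf, 0xB = toward zero) and copy the upper element(s) from the first
+ * source; with a zero mask the low element is taken from the passthrough
+ * operand (mask form) or zeroed (maskz form), as the expm0/expzm0 vectors
+ * below encode.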
+ */ + +void __declspec(noinline) do_roundscale_sd() { + __mmask8 k8; + volatile __m128d v1 = _mm_set_pd(2.0, (15.0)); + volatile __m128d v2 = _mm_set_pd(4.0, (1.6)); + volatile __m128d undef = _mm_set_pd(333.0, 111.0); + __m128d exp = _mm_set_pd(2.0, (2.0)); + __m128d exp1 = _mm_set_pd(2.0, (1.0)); + __m128d expm1 = _mm_set_pd(2.0, (2.0)); + __m128d expm0 = _mm_set_pd(2.0, 111.0); + __m128d expzm1 = _mm_set_pd(2.0, (1.0)); + __m128d expzm0 = _mm_set_pd(2.0, 0.0); + + { + volatile __m128d r = _mm_roundscale_sd(v1, v2, 0x8); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "roundscale_sd{imm=0x8}", + __LINE__); + r = _mm_roundscale_sd(v1, v2, 0x9); + check_equal_ndf(&r, &exp1, 2, + "_mm_" + "roundscale_sd{imm=0x9}", + __LINE__); + k8 = 1; + r = _mm_mask_roundscale_sd(undef, k8, v1, v2, 0xA); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_roundscale_sd{1}{imm=0xA}", + __LINE__); + k8 = 0; + r = _mm_mask_roundscale_sd(undef, k8, v1, v2, 0x8); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_roundscale_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_roundscale_sd(k8, v1, v2, 0xB); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_roundscale_sd{1}{imm=0xB}", + __LINE__); + k8 = 0; + r = _mm_maskz_roundscale_sd(k8, v1, v2, 0x8); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_roundscale_sd{0}", __LINE__); + } +} +void __declspec(noinline) do_roundscale_ss() { + __mmask8 k8; + volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (17.0f)); + volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (-1.6f)); + volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f); + __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (-2.0f)); + __m128 exp1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-2.0f)); + __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-1.0f)); + __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f); + __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-1.0f)); + __m128 expzm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f); + + { + volatile __m128 r = _mm_roundscale_ss(v1, v2, 0x8); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "roundscale_ss{imm=0x8}", + __LINE__); + r = _mm_roundscale_ss(v1, v2, 0x9); + check_equal_nsf(&r, &exp1, 4, + "_mm_" + "roundscale_ss{imm=0x9}", + __LINE__); + k8 = 1; + r = _mm_mask_roundscale_ss(undef, k8, v1, v2, 0xA); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_roundscale_ss{imm=0xA}", __LINE__); + k8 = 0; + r = _mm_mask_roundscale_ss(undef, k8, v1, v2, 0x8); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_roundscale_ss", __LINE__); + k8 = 1; + r = _mm_maskz_roundscale_ss(k8, v1, v2, 0xB); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_roundscale_ss{imm=0xB}", + __LINE__); + k8 = 0; + r = _mm_maskz_roundscale_ss(k8, v1, v2, 0x8); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_roundscale_ss", __LINE__); + } +} + +int main(int argc, char *argv[]) { + do_roundscale_sd(); + do_roundscale_ss(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output +++ SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/scalef.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/scalef.c +++ SingleSource/UnitTests/Vector/AVX512/scalef.c @@ -0,0 +1,311 @@ +#include "m512_test_util.h" +#include +#include + +/* + * Here we check for 
_mm_[mask|maskz]scalef_[round]_[ss|sd] intrinsics. + */ + +void __declspec(noinline) do_scalef_sd() { + __mmask8 k8; + volatile __m128d v1 = _mm_set_pd(2.0, (2.0)); + volatile __m128d v2 = _mm_set_pd(4.0, (2.5)); + volatile __m128d undef = _mm_set_pd(333.0, 111.0); + __m128d exp = _mm_set_pd(2.0, (8.0)); + __m128d expm1 = _mm_set_pd(2.0, (8.0)); + __m128d expm0 = _mm_set_pd(2.0, 111.0); + __m128d expzm1 = _mm_set_pd(2.0, (8.0)); + __m128d expzm0 = _mm_set_pd(2.0, 0.0); + + { + volatile __m128d r = _mm_scalef_sd(v1, v2); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "scalef_sd", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_sd(undef, k8, v1, v2); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_sd(undef, k8, v1, v2); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_sd(k8, v1, v2); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_sd(k8, v1, v2); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_sd{0}", __LINE__); + } + + { + volatile __m128d r = + _mm_scalef_round_sd(v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "scalef_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__); + } + + { + volatile __m128d r = _mm_scalef_round_sd(v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &exp, 2, "_mm_scalef_round_sd", __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__); + } + + { + volatile __m128d r = + _mm_scalef_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "scalef_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expzm0, 2, 
"_mm_maskz_scalef_round_sd{0}", __LINE__); + } + + { + volatile __m128d r = _mm_scalef_round_sd( + v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "scalef_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_sd( + undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_sd( + undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__); + } + + { + volatile __m128d r = _mm_scalef_round_sd( + v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "scalef_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__); + }; +} +void __declspec(noinline) do_scalef_ss() { + __mmask8 k8; + volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (2.0f)); + volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (2.5f)); + volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f); + __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f)); + __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f)); + __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f); + __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f)); + __m128 expzm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f); + + { + volatile __m128 r = _mm_scalef_ss(v1, v2); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_ss(undef, k8, v1, v2); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_ss(undef, k8, v1, v2); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_ss", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_ss(k8, v1, v2); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_ss(k8, v1, v2); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_ss", __LINE__); + } + + { + volatile __m128 r = _mm_scalef_round_ss(v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 1; + r = 
_mm_maskz_scalef_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__); + } + + { + volatile __m128 r = _mm_scalef_round_ss(v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__); + } + + { + volatile __m128 r = + _mm_scalef_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__); + } + + { + volatile __m128 r = _mm_scalef_round_ss( + v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_ss( + undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_ss( + undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__); + } + + { + volatile __m128 r = _mm_scalef_round_ss( + v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "scalef_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__); + k8 = 1; + r = _mm_maskz_scalef_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + 
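/* scalef computes a * 2^floor(b); here 2.0f * 2^floor(2.5f) = 8.0f is
+       exact, so the expected values do not depend on the rounding mode. */ + 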
check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__); + k8 = 0; + r = _mm_maskz_scalef_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__); + } +} + +int main(int argc, char *argv[]) { + do_scalef_sd(); + do_scalef_ss(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/scalef.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/scalef.reference_output +++ SingleSource/UnitTests/Vector/AVX512/scalef.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c +++ SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c @@ -0,0 +1,478 @@ +#include "m512_test_util.h" +#include +#include + +/* + * Here we check for _mm_[mask|mmaskz]sqrt_[round]_[ss|sd] intrinsics, but + * _mm_sqrt_ss,_mm_sqrt_sd intrinsics that belong to + * earlier versions of ISA. + */ + +void __declspec(noinline) do_sqrt_sd() { + __mmask8 k8; + volatile __m128d v1 = _mm_set_pd(2.0, (15.0)); + volatile __m128d v2 = _mm_set_pd(4.0, (0.25)); + volatile __m128d undef = _mm_set_pd(333.0, 111.0); + __m128d exp = _mm_set_pd(2.0, (0.5)); + __m128d expm1 = _mm_set_pd(2.0, (0.5)); + __m128d expm0 = _mm_set_pd(2.0, 111.0); + __m128d expzm1 = _mm_set_pd(2.0, (0.5)); + __m128d expzm0 = _mm_set_pd(2.0, 0.0); + volatile __m128d r; + k8 = 1; + r = _mm_mask_sqrt_sd(undef, k8, v1, v2); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_sd(undef, k8, v1, v2); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_sd(k8, v1, v2); + check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_sd(k8, v1, v2); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_sd{0}", + __LINE__); + + r = _mm_sqrt_round_sd(v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "sqrt" + "_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_round_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{0}", + __LINE__); + + r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "sqrt" + "_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_round_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + 
check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{0}", + __LINE__); + + r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "sqrt" + "_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_round_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{0}", + __LINE__); + + r = _mm_sqrt_round_sd(v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "sqrt" + "_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_round_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_sd( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{0}", + __LINE__); + + r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &exp, 2, + "_mm_" + "sqrt" + "_round_sd", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expm1, 2, + "_mm_mask_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expm0, 2, + "_mm_mask_" + "sqrt" + "_round_sd{0}", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expzm1, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{1}", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_sd(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_ndf(&r, &expzm0, 2, + "_mm_maskz_" + "sqrt" + "_round_sd{0}", + __LINE__); +} +void __declspec(noinline) do_sqrt_ss() { + __mmask8 k8; + volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (17.0f)); + volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (0.16f)); + volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f); + __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f)); + __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f)); + __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f); + __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f)); + __m128 expzm0 = 
_mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f); + volatile __m128 r; + k8 = 1; + r = _mm_mask_sqrt_ss(undef, k8, v1, v2); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_ss(undef, k8, v1, v2); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_ss(k8, v1, v2); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_ss(k8, v1, v2); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_ss", + __LINE__); + + r = _mm_sqrt_round_ss(v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION)); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + + r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC))); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + + r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO))); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + + r = _mm_sqrt_round_ss(v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | 
_MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_ss( + k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT))); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + + r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &exp, 4, + "_mm_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expm1, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expm0, 4, + "_mm_mask_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 1; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expzm1, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); + k8 = 0; + r = _mm_maskz_sqrt_round_ss(k8, v1, v2, + ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF))); + check_equal_nsf(&r, &expzm0, 4, + "_mm_maskz_" + "sqrt" + "_round_ss", + __LINE__); +} + +int main(int argc, char *argv[]) { + do_sqrt_sd(); + do_sqrt_ss(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output +++ SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/xor.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor.c +++ SingleSource/UnitTests/Vector/AVX512/xor.c @@ -0,0 +1,93 @@ +/* + * Test xor intrinsics. + * This test was created to check the correctness + * of the following intrinsics support: + * _mm512_xor_epi32() + * _mm512_mask_xor_epi32() + * _mm512_xor_epi64() + * _mm512_mask_xor_epi64() + */ + +#include "m512_test_util.h" +#include + +volatile int vol = 0; /* Inhibit optimization */ + +__m512 f1, f2, f3, f3_orig; +__m512d d1, d2, d3, d3_orig; +__m512i i1, i2, i3, i3_orig; + +void NOINLINE set_nonzero(void *vp, int c) { + int i; + V512 *v = (V512 *)vp; + + for (i = 0; i < 16; i++) { + v->u32[i] = 10 * i * i - 3 * i + c; + if (v->u32[i] == 0) { + v->u32[i] = 1234; + } + } +} + +void NOINLINE check_xor(void *vp1, void *vp2, void *vp3, void *vp_orig, + int mask, char *banner) { + int i; + V512 *v1 = (V512 *)vp1; + V512 *v2 = (V512 *)vp2; + V512 *v3 = (V512 *)vp3; + V512 *v_orig = (V512 *)vp_orig; + + for (i = 0; i < 16; i++) { + int actual = v3->u32[i]; + int expected = v_orig->u32[i]; + if (mask & (1 << i)) { + expected = v1->u32[i] ^ v2->u32[i]; + } + if (actual + vol != expected - vol) { + printf("ERROR: %s failed\n", banner ? 
banner : ""); + n_errs++; + break; + } + } +} + +void NOINLINE do_xor() { + set_nonzero(&i1, 99); + set_nonzero(&i2, 100); + set_nonzero(&f1, 33); + set_nonzero(&f2, -35); + set_nonzero(&d1, -11); + set_nonzero(&d2, 14); + + set_nonzero(&i3, 1000); + i3_orig = i3; + i3 = _mm512_xor_epi32(i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi32"); + + set_nonzero(&i3, 1500); + i3_orig = i3; + i3 = _mm512_mask_xor_epi32(i3_orig, 0x5555, i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0x5555, "_mm512_mask_xor_epi32"); + + set_nonzero(&i3, 2000); + i3_orig = i3; + i3 = _mm512_xor_epi64(i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi64"); + + set_nonzero(&i3, 2500); + i3_orig = i3; + i3 = _mm512_mask_xor_epi64(i3_orig, 0x55, i1, i2); + check_xor(&i1, &i2, &i3, &i3_orig, 0x3333, "_mm512_mask_xor_epi64"); +} + +int main() { + do_xor(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/xor.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor.reference_output +++ SingleSource/UnitTests/Vector/AVX512/xor.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c @@ -0,0 +1,70 @@ +#include "m512_test_util.h" +#include +#include +/* This test was created to check the correctness + * of the following intrinsics support: + * _mm512_or_epi32() + * _mm512_mask_or_epi32() + * _mm512_xor_epi32() + * _mm512_mask_xor_epi32() + */ +void __declspec(noinline) + check_equal_epi32(__m512i vres, __m512i vexp, char *banner, int line) { + int i; + + __declspec(align(64)) int res[16]; + __declspec(align(64)) int exp[16]; + + _mm512_store_epi32(res, vres); + _mm512_store_epi32(exp, vexp); + + for (i = 0; i < 16; i++) { + if (res[i] != exp[i]) { + printf("ERROR: %s failed at line %d with result (%d) != " + "(%d) element %d\n", + banner, line, res[i], exp[i], i); + ++n_errs; + } + } +} + +void __declspec(noinline) do_or_() { + __mmask16 k8 = 0xAAAA; + volatile __m512i undef = _mm512_set1_epi32(3); + volatile __m512i v1 = _mm512_set1_epi32((10)); + volatile __m512i v2 = _mm512_set1_epi32((3)); + volatile __m512i exp1 = _mm512_set1_epi32((11)); + volatile __m512i exp2 = _mm512_set_epi32((11), 3, (11), 3, (11), 3, (11), 3, + (11), 3, (11), 3, (11), 3, (11), 3); + volatile __m512i r = _mm512_or_epi32(v1, v2); + check_equal_nd(&r, &exp1, 16, "_mm512_or_epi32", __LINE__); + r = _mm512_mask_or_epi32(undef, k8, v1, v2); + check_equal_nd(&r, &exp2, 16, "_mm512_mask_or_epi32", __LINE__); +} + +void __declspec(noinline) do_xor_() { + __mmask16 k8 = 0xAAAA; + volatile __m512i undef = _mm512_set1_epi32(3); + volatile __m512i v1 = _mm512_set1_epi32((10)); + volatile __m512i v2 = _mm512_set1_epi32((3)); + volatile __m512i exp1 = _mm512_set1_epi32((9)); + volatile __m512i exp2 = _mm512_set_epi32((9), 3, (9), 3, (9), 3, (9), 3, (9), + 3, (9), 3, (9), 3, (9), 3); + volatile __m512i r = _mm512_xor_epi32(v1, v2); + check_equal_nd(&r, &exp1, 16, "_mm512_xor_epi32", __LINE__); + r = _mm512_mask_xor_epi32(undef, k8, v1, v2); + check_equal_nd(&r, &exp2, 16, "_mm512_mask_xor_epi32", __LINE__); +} + +int main(int argc, char *argv[]) { + do_or_(); + do_xor_(); + + if (n_errs != 0) { + printf("FAILED\n"); + 
return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c @@ -0,0 +1,48 @@ +#include "m512_test_util.h" +#include +#include +/* This test was created to check the correctness + * of the following intrinsics support: + * _mm512_or_epi64() + * _mm512_mask_or_epi64() + * _mm512_xor_epi64() + * _mm512_mask_xor_epi64() + */ +void __declspec(noinline) do_or_() { + __mmask16 k8 = 0xAAAA; + volatile __m512i undef = _mm512_set1_epi64(3); + volatile __m512i v1 = _mm512_set1_epi64((10)); + volatile __m512i v2 = _mm512_set1_epi64((3)); + volatile __m512i exp1 = _mm512_set1_epi64((11)); + volatile __m512i exp2 = _mm512_set_epi64((11), 3, (11), 3, (11), 3, (11), 3); + volatile __m512i r = _mm512_or_epi64(v1, v2); + check_equal_nq(&r, &exp1, 8, "_mm512_or_epi64", __LINE__); + r = _mm512_mask_or_epi64(undef, k8, v1, v2); + check_equal_nq(&r, &exp2, 8, "_mm512_mask_or_epi64", __LINE__); +} + +void __declspec(noinline) do_xor_() { + __mmask16 k8 = 0xAAAA; + volatile __m512i undef = _mm512_set1_epi64(3); + volatile __m512i v1 = _mm512_set1_epi64((10)); + volatile __m512i v2 = _mm512_set1_epi64((3)); + volatile __m512i exp1 = _mm512_set1_epi64((9)); + volatile __m512i exp2 = _mm512_set_epi64((9), 3, (9), 3, (9), 3, (9), 3); + volatile __m512i r = _mm512_xor_epi64(v1, v2); + check_equal_nq(&r, &exp1, 8, "_mm512_xor_epi64", __LINE__); + r = _mm512_mask_xor_epi64(undef, k8, v1, v2); + check_equal_nq(&r, &exp2, 8, "_mm512_mask_xor_epi64", __LINE__); +} + +int main(int argc, char *argv[]) { + do_or_(); + do_xor_(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c @@ -0,0 +1,36 @@ +#include "m512_test_util.h" +#include +#include +/* This test was created to check the correctness + * of the following intrinsics support: + * _mm512_or_si512() + * _mm512_xor_si512() + */ +void __declspec(noinline) do_or_() { + volatile __m512i v1 = _mm512_set1_epi32((10)); + volatile __m512i v2 = _mm512_set1_epi32((3)); + volatile __m512i exp1 = _mm512_set1_epi32((11)); + volatile __m512i r = _mm512_or_si512(v1, v2); + check_equal_nd(&r, &exp1, 16, "_mm512_or_si512", __LINE__); +} + +void __declspec(noinline) do_xor_() { + volatile __m512i v1 = _mm512_set1_epi32((10)); + volatile __m512i v2 = _mm512_set1_epi32((3)); + volatile __m512i exp1 = _mm512_set1_epi32((9)); + volatile __m512i r = _mm512_xor_si512(v1, v2); + check_equal_nd(&r, &exp1, 
16, "_mm512_xor_si512", __LINE__); +} + +int main(int argc, char *argv[]) { + do_or_(); + do_xor_(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} Index: SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output =================================================================== --- SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output +++ SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output @@ -0,0 +1,2 @@ +PASSED +exit 0 Index: SingleSource/UnitTests/Vector/CMakeLists.txt =================================================================== --- SingleSource/UnitTests/Vector/CMakeLists.txt +++ SingleSource/UnitTests/Vector/CMakeLists.txt @@ -12,4 +12,9 @@ add_subdirectory(AArch64) endif() +if(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(ARCH STREQUAL "x86" AND X86CPU_ARCH STREQUAL "skylake-avx512") + add_subdirectory(AVX512) + endif() +endif() llvm_singlesource(PREFIX "Vector-") Index: SingleSource/UnitTests/Vector/Makefile =================================================================== --- SingleSource/UnitTests/Vector/Makefile +++ SingleSource/UnitTests/Vector/Makefile @@ -16,6 +16,12 @@ DIRS += SSE endif +ifeq ($(CC_UNDER_TEST_IS_CLANG), 1) +ifeq ($(HAVE_X86_AVX512F_INSTRUCTIONS), 1) +DIRS += AVX512 +endif +endif + # Assume ARMv7 implies NEON. ifneq ($(CC_UNDER_TEST_TARGET_IS_THUMBV7),) DIRS += NEON