Index: include/clang/Basic/BuiltinsX86.def
===================================================================
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -1488,8 +1488,10 @@
 TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V8dV8d*V8dUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4d*V4dUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V16fV16f*V16fUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8f*V8fUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2LLiV2LLi*V2LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4LLiV4LLi*V4LLiUc","","avx512vl")
@@ -1506,8 +1508,10 @@
 TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs","","avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi","","avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV8d*V8dUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc","","avx512vl")
+TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV16f*V16fUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2LLi*V2LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4LLi*V4LLiUc","","avx512vl")
@@ -1856,8 +1860,6 @@
 TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movss_mask, "V4fV4fV4fV4fUc","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movsd_mask, "V2dV2dV2dV2dUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8LLi*V8LLiUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs","","avx512f")
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -7371,6 +7371,10 @@
   case X86::BI__builtin_ia32_storeups512_mask:
     return EmitX86MaskedStore(*this, Ops, 1);
 
+  case X86::BI__builtin_ia32_storess128_mask:
+  case X86::BI__builtin_ia32_storesd128_mask: {
+    return EmitX86MaskedStore(*this, Ops, 16);
+  }
   case X86::BI__builtin_ia32_movdqa32store128_mask:
   case X86::BI__builtin_ia32_movdqa64store128_mask:
   case X86::BI__builtin_ia32_storeaps128_mask:
@@ -7407,6 +7411,10 @@
   case X86::BI__builtin_ia32_loaddqudi512_mask:
     return EmitX86MaskedLoad(*this, Ops, 1);
 
+  case X86::BI__builtin_ia32_loadss128_mask:
+  case X86::BI__builtin_ia32_loadsd128_mask:
+    return EmitX86MaskedLoad(*this, Ops, 16);
+
   case X86::BI__builtin_ia32_loadaps128_mask:
   case X86::BI__builtin_ia32_loadaps256_mask:
   case X86::BI__builtin_ia32_loadaps512_mask:
Index: lib/Headers/avx512fintrin.h
===================================================================
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -516,6 +516,18 @@
   return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
 }
 
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+  return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+  return (int)__a;
+}
+
 /* Bitwise operators */
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
@@ -9107,35 +9119,96 @@
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss_mask ((__v4sf) __A, (__v4sf) __B,
-                                             (__v4sf) __W,
-                                             (__mmask8) __U);
+  __m128 res = __A;
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res;
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss_mask ((__v4sf) __A, (__v4sf) __B,
-                                             (__v4sf)
-                                             _mm_setzero_si128(),
-                                             (__mmask8) __U);
+  __m128 res = __A;
+  res[0] = (__U & 1) ? __B[0] : 0;
+  return res;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd_mask ((__v2df) __A, (__v2df) __B,
-                                              (__v2df) __W,
-                                              (__mmask8) __U);
+  __m128d res = __A;
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd_mask ((__v2df) __A, (__v2df) __B,
-                                              (__v2df)
-                                              _mm_setzero_pd (),
-                                              (__mmask8) __U);
+  __m128d res = __A;
+  res[0] = (__U & 1) ? __B[0] : 0;
+  return res;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess128_mask ((__v16sf *)__W,
+                (__v16sf) _mm512_castps128_ps512(__A),
+                (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd128_mask ((__v8df *)__W,
+                (__v8df) _mm512_castpd128_pd512(__A),
+                (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+                                                (__v4sf) {0.0, 0.0, 0.0, 0.0},
+                                                0, 4, 4, 4);
+
+  return (__m128) __builtin_shufflevector(
+                    __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                                   (__v16sf) _mm512_castps128_ps512(src),
+                                                   (__mmask16) __U & 1),
+                    _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+  return (__m128) __builtin_shufflevector(
+                    __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                                   (__v16sf) _mm512_setzero_ps(),
+                                                   (__mmask16) __U & 1),
+                    _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+                                                 (__v2df) {0.0, 0.0}, 0, 2);
+
+  return (__m128d) __builtin_shufflevector(
+                    __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                                   (__v8df) _mm512_castpd128_pd512(src),
+                                                   (__mmask8) __U & 1),
+                    _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+  return (__m128d) __builtin_shufflevector(
+                    __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                                   (__v8df) _mm512_setzero_pd(),
+                                                   (__mmask8) __U & 1),
+                    _mm512_undefined_pd(), 0, 1);
 }
 
 #define _mm512_shuffle_epi32(A, I) __extension__ ({ \
Index: test/CodeGen/avx512f-builtins.c
===================================================================
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -O2 -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=O2
 
 #include <immintrin.h>
 
@@ -7979,34 +7980,143 @@
   return _mm512_setzero_pd();
 }
 
+__mmask16 test_mm512_int2mask(int __a)
+{
+  // O2-LABEL: test_mm512_int2mask
+  // O2: trunc i32 %__a to i16
+  return _mm512_int2mask(__a);
+}
+
+int test_mm512_mask2int(__mmask16 __a)
+{
+  // O2-LABEL: test_mm512_mask2int
+  // O2: zext i16 %__a to i32
+  return _mm512_mask2int(__a);
+}
+
 __m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  // CHECK-LABEL: @test_mm_mask_move_ss
-  // CHECK: @llvm.x86.avx512.mask.move.ss
+  // O2-LABEL: @test_mm_mask_move_ss
+  // O2: %[[M:.*]] = and i8 %__U, 1
+  // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0
+  // O2: %[[ELM1:.*]] = extractelement <4 x float> %__B, i32 0
+  // O2: %[[ELM2:.*]] = extractelement <4 x float> %__W, i32 0
+  // O2: %[[SEL:.*]] = select i1 %[[M2]], float %[[ELM1]], float %[[ELM2]]
+  // O2: %[[RES:.*]] = insertelement <4 x float> %__A, float %[[SEL]], i32 0
+  // O2: ret <4 x float> %[[RES]]
  return _mm_mask_move_ss ( __W, __U, __A, __B);
 }
 
 __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  // CHECK-LABEL: @test_mm_maskz_move_ss
-  // CHECK: @llvm.x86.avx512.mask.move.ss
+  // O2-LABEL: @test_mm_maskz_move_ss
+  // O2: %[[M:.*]] = and i8 %__U, 1
+  // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0
+  // O2: %[[ELM1:.*]] = extractelement <4 x float> %__B, i32 0
+  // O2: %[[SEL:.*]] = select i1 %[[M2]], float %[[ELM1]], float 0.0
+  // O2: %[[RES:.*]] = insertelement <4 x float> %__A, float %[[SEL]], i32 0
+  // O2: ret <4 x float> %[[RES]]
  return _mm_maskz_move_ss (__U, __A, __B);
 }
 
-__m128d test_mm_mask_move_sd (__m128 __W, __mmask8 __U, __m128d __A, __m128d __B)
+__m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  // CHECK-LABEL: @test_mm_mask_move_sd
-  // CHECK: @llvm.x86.avx512.mask.move.sd
+  // O2-LABEL: @test_mm_mask_move_sd
+  // O2: %[[M:.*]] = and i8 %__U, 1
+  // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0
+  // O2: %[[ELM1:.*]] = extractelement <2 x double> %__B, i32 0
+  // O2: %[[ELM2:.*]] = extractelement <2 x double> %__W, i32 0
+  // O2: %[[SEL:.*]] = select i1 %[[M2]], double %[[ELM1]], double %[[ELM2]]
+  // O2: %[[RES:.*]] = insertelement <2 x double> %__A, double %[[SEL]], i32 0
+  // O2: ret <2 x double> %[[RES]]
  return _mm_mask_move_sd ( __W, __U, __A, __B);
 }
 
 __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  // CHECK-LABEL: @test_mm_maskz_move_sd
-  // CHECK: @llvm.x86.avx512.mask.move.sd
+  // O2-LABEL: @test_mm_maskz_move_sd
+  // O2: %[[M:.*]] = and i8 %__U, 1
+  // O2: %[[M2:.*]] = icmp ne i8 %[[M]], 0
+  // O2: %[[ELM1:.*]] = extractelement <2 x double> %__B, i32 0
+  // O2: %[[SEL:.*]] = select i1 %[[M2]], double %[[ELM1]], double 0.0
+  // O2: %[[RES:.*]] = insertelement <2 x double> %__A, double %[[SEL]], i32 0
+  // O2: ret <2 x double> %[[RES]]
  return _mm_maskz_move_sd (__U, __A, __B);
 }
 
+void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A)
+{
+  // O2-LABEL: @test_mm_mask_store_ss
+  // O2: %[[CAST:.*]] = bitcast float* %__P to <16 x float>*
+  // O2: %[[SHUFFLE:.*]] = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32>
+  // O2: %[[MASK1:.*]] = and i8 %__U, 1
+  // O2: %[[MASK2:.*]] = zext i8 %[[MASK1]] to i16
+  // O2: %[[MASK3:.*]] = bitcast i16 %[[MASK2]] to <16 x i1>
+  // O2: tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %[[SHUFFLE]], <16 x float>* %[[CAST]], i32 16, <16 x i1> %[[MASK3]])
+  _mm_mask_store_ss(__P, __U, __A);
+}
+
+void test_mm_mask_store_sd(double * __P, __mmask8 __U, __m128d __A)
+{
+  // O2-LABEL: @test_mm_mask_store_sd
+  // O2: %[[CAST:.*]] = bitcast double* %__P to <8 x double>*
+  // O2: %[[SHUFFLE:.*]] = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32>
+  // O2: %[[MASK1:.*]] = and i8 %__U, 1
+  // O2: %[[MASK2:.*]] = bitcast i8 %[[MASK1]] to <8 x i1>
+  // O2: tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %[[SHUFFLE]], <8 x double>* %[[CAST]], i32 16, <8 x i1> %[[MASK2]])
+  _mm_mask_store_sd(__P, __U, __A);
+}
+
+__m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
+{
+  // O2-LABEL: @test_mm_mask_load_ss
+  // O2: %[[SHUF:.*]] = shufflevector <4 x float> %__A, <4 x float> , <4 x i32>
+  // O2: %[[PTR:.*]] = bitcast float* %__W to <16 x float>*
+  // O2: %[[SHUF2:.*]] = shufflevector <4 x float> %[[SHUF]], <4 x float> undef, <16 x i32>
+  // O2: %[[AND:.*]] = and i8 %__U, 1
+  // O2: %[[MASK:.*]] = zext i8 %[[AND]] to i16
+  // O2: %[[MASK2:.*]] = bitcast i16 %[[MASK]] to <16 x i1>
+  // O2: %[[RES:.*]] = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %[[PTR]], i32 16, <16 x i1> %[[MASK2]], <16 x float> %[[SHUF2]])
+  // O2: shufflevector <16 x float> %[[RES]], <16 x float> undef, <4 x i32>
+  return _mm_mask_load_ss(__A, __U, __W);
+}
+
+__m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
+{
+  // O2-LABEL: @test_mm_maskz_load_ss
+  // O2: %[[PTR:.*]] = bitcast float* %__W to <16 x float>*
+  // O2: %[[AND:.*]] = and i8 %__U, 1
+  // O2: %[[MASK:.*]] = zext i8 %[[AND]] to i16
+  // O2: %[[MASK2:.*]] = bitcast i16 %[[MASK]] to <16 x i1>
+  // O2: %[[RES:.*]] = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %[[PTR]], i32 16, <16 x i1> %[[MASK2]], <16 x float> zeroinitializer)
+  // O2: shufflevector <16 x float> %[[RES]], <16 x float> undef, <4 x i32>
+  return _mm_maskz_load_ss (__U, __W);
+}
+
+__m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
+{
+  // O2-LABEL: @test_mm_mask_load_sd
+  // O2: %[[SHUF:.*]] = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
+  // O2: %[[PTR:.*]] = bitcast double* %__W to <8 x double>*
+  // O2: %[[SHUF2:.*]] = shufflevector <2 x double> %[[SHUF]], <2 x double> undef, <8 x i32>
+  // O2: %[[AND:.*]] = and i8 %__U, 1
+  // O2: %[[MASK:.*]] = bitcast i8 %[[AND]] to <8 x i1>
+  // O2: %[[RES:.*]] = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %[[PTR]], i32 16, <8 x i1> %[[MASK]], <8 x double> %[[SHUF2]])
+  // O2: shufflevector <8 x double> %[[RES]], <8 x double> undef, <2 x i32>
+  return _mm_mask_load_sd (__A, __U, __W);
+}
+
+__m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
+{
+  // O2-LABEL: @test_mm_maskz_load_sd
+  // O2: %[[PTR:.*]] = bitcast double* %__W to <8 x double>*
+  // O2: %[[AND:.*]] = and i8 %__U, 1
+  // O2: %[[MASK:.*]] = bitcast i8 %[[AND]] to <8 x i1>
+  // O2: %[[RES:.*]] = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %[[PTR]], i32 16, <8 x i1> %[[MASK]], <8 x double> zeroinitializer)
+  // O2: shufflevector <8 x double> %[[RES]], <8 x double> undef, <2 x i32>
+  return _mm_maskz_load_sd (__U, __W);
+}
+
 __m512d test_mm512_abs_pd(__m512d a){
   // CHECK-LABEL: @test_mm512_abs_pd
   // CHECK: and <8 x i64>
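
For reference only (not part of the patch): a minimal usage sketch of the intrinsics this change adds or reimplements. It assumes a compiler with AVX-512F support (for example, clang -O2 -mavx512f); the variable names and values below are illustrative, not taken from the patch.

/* Usage sketch for _mm_mask_load_ss/_mm_mask_store_ss and the new
 * _mm512_int2mask/_mm512_mask2int helpers.  Illustrative only. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float buf[1] = {42.0f};
  __m128 passthru = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.5f);  /* element 0 is 0.5 */

  /* Masked scalar load: bit 0 of the mask selects memory vs. the
   * pass-through element 0; elements 1-3 of the result are zeroed. */
  __m128 loaded = _mm_mask_load_ss(passthru, (__mmask8)1, buf); /* element 0 = 42.0 */
  __m128 kept   = _mm_mask_load_ss(passthru, (__mmask8)0, buf); /* element 0 = 0.5  */

  /* Masked scalar store: writes buf[0] only when bit 0 of the mask is set. */
  _mm_mask_store_ss(buf, (__mmask8)0, _mm_set_ss(7.0f)); /* no write          */
  _mm_mask_store_ss(buf, (__mmask8)1, _mm_set_ss(7.0f)); /* buf[0] becomes 7  */

  /* Mask <-> int conversions added to avx512fintrin.h. */
  __mmask16 m = _mm512_int2mask(0x00FF);
  int back = _mm512_mask2int(m);                          /* back == 255      */

  printf("%f %f %f %d\n", buf[0], _mm_cvtss_f32(loaded),
         _mm_cvtss_f32(kept), back);
  return 0;
}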