diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2020,16 +2020,22 @@
 TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
 TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12508,6 +12508,7 @@
   SmallVector<Value*, 4> Ops;
   bool IsMaskFCmp = false;
+  bool IsConjFMA = false;
 
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
@@ -15046,6 +15047,36 @@
     Builder.SetInsertPoint(End);
     return Builder.CreateExtractValue(Call, 0);
   }
+  case X86::BI__builtin_ia32_vfcmaddcph512_mask:
+    IsConjFMA = true;
+    LLVM_FALLTHROUGH;
+  case X86::BI__builtin_ia32_vfmaddcph512_mask: {
+    Intrinsic::ID IID = IsConjFMA
+                            ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
+                            : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    return EmitX86Select(*this, Ops[3], Call, Ops[0]);
+  }
+  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
+    IsConjFMA = true;
+    LLVM_FALLTHROUGH;
+  case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
+    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
+                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
+    return EmitX86Select(*this, And, Call, Ops[0]);
+  }
+  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
+    IsConjFMA = true;
+    LLVM_FALLTHROUGH;
+  case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
+    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
+                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    static constexpr int Mask[] = {0, 5, 6, 7};
+    return Builder.CreateShuffleVector(Call, Ops[2], Mask);
+  }
   }
 }
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -2941,11 +2941,8 @@
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      __U,
-      __builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
-                                     (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
-      (__v4sf)__A);
+  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
+      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
@@ -2957,10 +2954,8 @@
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  return (__m128h)_mm_move_ss((__m128)__C,
-                              (__m128)__builtin_ia32_vfcmaddcsh_mask(
-                                  (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U,
-                                  _MM_FROUND_CUR_DIRECTION));
+  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm_fcmadd_round_sch(A, B, C, R) \
@@ -2969,12 +2964,9 @@
       (__mmask8)-1, (int)(R)))
 
 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
-  ((__m128h)__builtin_ia32_selectps_128( \
-      (__mmask8)(U & 1), \
-      __builtin_ia32_vfcmaddcsh_mask( \
-          (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
-          (__mmask8)(U), (int)(R)), \
-      (__v4sf)(__m128h)(A)))
+  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
+      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+      (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
   ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
@@ -2982,9 +2974,9 @@
       (__mmask8)(U), (int)(R)))
 
 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
-  ((__m128h)_mm_move_ss((__m128)(C), \
-                        (__m128)__builtin_ia32_vfcmaddcsh_mask( \
-                            (__v4sf)(A), (__v4sf)(B), (__v4sf)(C), (U), (R)))) 
+  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
+      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+      (__mmask8)(U), (int)(R)))
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_fmadd_sch(__m128h __A, __m128h __B,
@@ -2996,11 +2988,8 @@
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  return (__m128h)__builtin_ia32_selectps_128(
-      __U,
-      __builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
-                                    (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
-      (__v4sf)__A);
+  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
+      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
@@ -3010,24 +2999,32 @@
                                                  _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
 #define _mm_fmadd_round_sch(A, B, C, R) \
   ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
       (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
       (__mmask8)-1, (int)(R)))
 
 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
-  ((__m128h)__builtin_ia32_selectps_128( \
-      (__mmask8)(U & 1), \
-      __builtin_ia32_vfmaddcsh_mask( \
-          (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
-          (__mmask8)(U), (int)(R)), \
-      (__v4sf)(__m128h)(A)))
+  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
+      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+      (__mmask8)(U), (int)(R)))
 
 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
   ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))
 
+#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
+  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
+      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+      (__mmask8)(U), (int)(R)))
+
 static __inline__ __m128h __DEFAULT_FN_ATTRS128
 _mm_fcmul_sch(__m128h __A, __m128h __B) {
   return (__m128h)__builtin_ia32_vfcmulcsh_mask(
@@ -3177,24 +3174,21 @@
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                    (__v16sf)__C, (__mmask16)-1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
+      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
+      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_selectps_512(
-      __U,
-      __builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                        (__v16sf)__C, (__mmask16)__U,
-                                        _MM_FROUND_CUR_DIRECTION),
-      (__v16sf)__A);
+  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
       (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
       _MM_FROUND_CUR_DIRECTION);
 }
@@ -3207,20 +3201,17 @@
 }
 
 #define _mm512_fcmadd_round_pch(A, B, C, R) \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
       (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
       (__mmask16)-1, (int)(R)))
 
 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
-  ((__m512h)__builtin_ia32_selectps_512( \
-      (__mmask16)(U), \
-      __builtin_ia32_vfcmaddcph512_mask( \
-          (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
-          (__mmask16)(U), (int)(R)), \
-      (__v16sf)(__m512h)(A)))
+  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+      (__mmask16)(U), (int)(R)))
 
 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
-  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
 
@@ -3232,26 +3223,23 @@
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                   (__v16sf)__C, (__mmask16)-1,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
+                                                     (__v16sf)__C, (__mmask16)-1,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  return (__m512h)__builtin_ia32_selectps_512(
-      __U,
-      __builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C,
-                                       (__mmask16)__U,
-                                       _MM_FROUND_CUR_DIRECTION),
-      (__v16sf)__A);
+  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
+                                                    (__v16sf)__C, (__mmask16)__U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
-                                                   (__v16sf)__C, (__mmask16)__U,
-                                                   _MM_FROUND_CUR_DIRECTION);
+  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
+      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
@@ -3262,20 +3250,17 @@
 }
 
 #define _mm512_fmadd_round_pch(A, B, C, R) \
-  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))
 
 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
-  ((__m512h)__builtin_ia32_selectps_512( \
-      (__mmask16)(U), \
-      __builtin_ia32_vfmaddcph512_mask( \
-          (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
-          (__mmask16)(U), (int)(R)), \
-      (__v16sf)(__m512h)(A)))
+  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+      (__mmask16)(U), (int)(R)))
 
 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
-  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4129,11 +4129,17 @@
   case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
   case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
   case X86::BI__builtin_ia32_vfmaddcsh_mask:
+  case X86::BI__builtin_ia32_vfmaddcsh_round_mask:
+  case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_vfmaddcph512_mask:
   case X86::BI__builtin_ia32_vfmaddcph512_maskz:
+  case X86::BI__builtin_ia32_vfmaddcph512_mask3:
   case X86::BI__builtin_ia32_vfcmaddcsh_mask:
+  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
+  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_vfcmaddcph512_mask:
   case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
+  case X86::BI__builtin_ia32_vfcmaddcph512_mask3:
   case X86::BI__builtin_ia32_vfmulcsh_mask:
   case X86::BI__builtin_ia32_vfmulcph512_mask:
   case X86::BI__builtin_ia32_vfcmulcsh_mask:
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -4086,10 +4086,8 @@
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
-  // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fcmadd_sch(__A, __B, __C, __U);
 }
@@ -4119,10 +4117,8 @@
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
-  // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fcmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -4147,6 +4143,17 @@
   return _mm_maskz_fmadd_sch(__U, __A, __B, __C);
 }
 
+__m128h test_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+  // CHECK-LABEL: @test_mm_mask3_fmadd_sch
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
+  // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+  return _mm_mask3_fmadd_sch(__A, __B, __C, __U);
+}
+
 __m128h test_mm_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
   // CHECK-LABEL: @test_mm_fmadd_round_sch
   // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
@@ -4167,6 +4174,17 @@
   return _mm_maskz_fmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+__m128h test_mm_mask3_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sch
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+  // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
+  // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+  return _mm_mask3_fmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
 __m128h test_mm_fcmul_sch(__m128h __A, __m128h __B) {
   // CHECK-LABEL: @test_mm_fcmul_sch
   // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5754,7 +5754,7 @@
                   [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
                   [ IntrNoMem ]>;
   def int_x86_avx512fp16_mask_vfcmadd_cph_512
-      : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask">,
+      : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask3">,
         Intrinsic<[ llvm_v16f32_ty ],
                   [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                     llvm_i16_ty, llvm_i32_ty ],
@@ -5786,7 +5786,7 @@
                   [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
                   [ IntrNoMem ]>;
   def int_x86_avx512fp16_mask_vfmadd_cph_512
-      : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask">,
+      : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask3">,
        Intrinsic<[ llvm_v16f32_ty ],
                  [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
                    llvm_i16_ty, llvm_i32_ty ],
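For reference, a minimal usage sketch of the merge-into-C (mask3) scalar form added by this patch. This is not part of the patch itself: the input values, the dump() helper, and the build line in the comment are illustrative assumptions; only the intrinsic name _mm_mask3_fmadd_sch and its merge behaviour come from the changes above.

/*
 * Sketch: exercising _mm_mask3_fmadd_sch (assumed build: clang -O2 -mavx512fp16 demo.c).
 * Lanes 0/1 of each __m128h hold one complex FP16 value as (real, imag).
 */
#include <immintrin.h>
#include <stdio.h>

static void dump(const char *name, __m128h v) {
  /* __m128h is a Clang vector of 8 x _Float16; [] indexing is a GNU C
     vector extension, used here only for printing. */
  printf("%s = [% .1f % .1f % .1f % .1f ...]\n", name,
         (float)v[0], (float)v[1], (float)v[2], (float)v[3]);
}

int main(void) {
  __m128h a = {(_Float16)1.0, (_Float16)2.0, 0, 0, 0, 0, 0, 0}; /* 1 + 2i */
  __m128h b = {(_Float16)3.0, (_Float16)4.0, 0, 0, 0, 0, 0, 0}; /* 3 + 4i */
  __m128h c = {(_Float16)9.0, (_Float16)9.0, (_Float16)7.0, (_Float16)7.0,
               0, 0, 0, 0};

  /* Mask bit 0 set: lanes 0-1 get the complex result a*b + c; lanes 2-7 are
     copied from c (the mask3 "merge into the addend" convention). */
  dump("fmadd_sch k=1", _mm_mask3_fmadd_sch(a, b, c, (__mmask8)0x1));

  /* Mask bit 0 clear: the whole result is taken from c. */
  dump("fmadd_sch k=0", _mm_mask3_fmadd_sch(a, b, c, (__mmask8)0x0));
  return 0;
}

The mask3 variants differ from the plain masked forms only in which operand supplies the masked-off and upper elements (__C rather than __A); that is exactly what the new *_mask3 builtins and the {0, 5, 6, 7} shufflevector lowering in CGBuiltin.cpp encode.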