Index: cfe/trunk/include/clang/Basic/BuiltinsX86.def =================================================================== --- cfe/trunk/include/clang/Basic/BuiltinsX86.def +++ cfe/trunk/include/clang/Basic/BuiltinsX86.def @@ -731,10 +731,22 @@ TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "nc", "fma|fma4") TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "nc", "fma|fma4") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd512, "V8dV8dV8dV8dIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddps512, "V16fV16fV16fV16fIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512, "V8dV8dV8dV8dIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512, "V16fV16fV16fV16fIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") // XOP TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "nc", "xop") Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -8555,79 +8555,110 @@ // Lowers X86 FMA intrinsics to IR. static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef Ops, - unsigned BuiltinID) { + unsigned BuiltinID, bool IsAddSub) { - bool IsAddSub = false; - bool IsScalar = false; - - // 4 operands always means rounding mode without a mask here. - bool IsRound = Ops.size() == 4; - - Intrinsic::ID ID; + bool Subtract = false; + Intrinsic::ID IID = Intrinsic::not_intrinsic; switch (BuiltinID) { default: break; - case clang::X86::BI__builtin_ia32_vfmaddss3: IsScalar = true; break; - case clang::X86::BI__builtin_ia32_vfmaddsd3: IsScalar = true; break; - case clang::X86::BI__builtin_ia32_vfmaddps512: - ID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break; - case clang::X86::BI__builtin_ia32_vfmaddpd512: - ID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break; - case clang::X86::BI__builtin_ia32_vfmaddsubps: IsAddSub = true; break; - case clang::X86::BI__builtin_ia32_vfmaddsubpd: IsAddSub = true; break; - case clang::X86::BI__builtin_ia32_vfmaddsubps256: IsAddSub = true; break; - case clang::X86::BI__builtin_ia32_vfmaddsubpd256: IsAddSub = true; break; - case clang::X86::BI__builtin_ia32_vfmaddsubps512: { - ID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512; - IsAddSub = true; + case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break; + case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break; + case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512; break; - } - case clang::X86::BI__builtin_ia32_vfmaddsubpd512: { - ID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512; - IsAddSub = true; + case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512; break; } - } - - // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding). - if (IsRound) { - Function *Intr = CGF.CGM.getIntrinsic(ID); - if (cast(Ops[3])->getZExtValue() != (uint64_t)4) - return CGF.Builder.CreateCall(Intr, Ops); - } Value *A = Ops[0]; Value *B = Ops[1]; Value *C = Ops[2]; - if (IsScalar) { - A = CGF.Builder.CreateExtractElement(A, (uint64_t)0); - B = CGF.Builder.CreateExtractElement(B, (uint64_t)0); - C = CGF.Builder.CreateExtractElement(C, (uint64_t)0); - } - - llvm::Type *Ty = A->getType(); - Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); - Value *Res = CGF.Builder.CreateCall(FMA, {A, B, C} ); - - if (IsScalar) - return CGF.Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0); - - if (IsAddSub) { - // Negate even elts in C using a mask. - unsigned NumElts = Ty->getVectorNumElements(); - SmallVector NMask; - Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0); - Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1); - for (unsigned i = 0; i < NumElts; ++i) { - NMask.push_back(i % 2 == 0 ? One : Zero); - } - Value *NegMask = ConstantVector::get(NMask); - - Value *NegC = CGF.Builder.CreateFNeg(C); - Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} ); - Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res); + if (Subtract) + C = CGF.Builder.CreateFNeg(C); + + Value *Res; + + // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding). + if (IID != Intrinsic::not_intrinsic && + cast(Ops.back())->getZExtValue() != (uint64_t)4) { + Function *Intr = CGF.CGM.getIntrinsic(IID); + Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() }); + } else { + llvm::Type *Ty = A->getType(); + Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); + Res = CGF.Builder.CreateCall(FMA, {A, B, C} ); + + if (IsAddSub) { + // Negate even elts in C using a mask. + unsigned NumElts = Ty->getVectorNumElements(); + SmallVector NMask; + Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0); + Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1); + for (unsigned i = 0; i < NumElts; ++i) { + NMask.push_back(i % 2 == 0 ? One : Zero); + } + Value *NegMask = ConstantVector::get(NMask); + + Value *NegC = CGF.Builder.CreateFNeg(C); + Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} ); + Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res); + } } + // Handle any required masking. + Value *MaskFalseVal = nullptr; + switch (BuiltinID) { + case clang::X86::BI__builtin_ia32_vfmaddps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: + MaskFalseVal = Ops[0]; + break; + case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + MaskFalseVal = Constant::getNullValue(Ops[0]->getType()); + break; + case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + MaskFalseVal = Ops[2]; + break; + } + + if (MaskFalseVal) + return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal); + return Res; } @@ -9046,20 +9077,40 @@ return EmitX86ConvertToMask(*this, Ops[0]); case X86::BI__builtin_ia32_vfmaddss3: - case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddsd3: { + Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Value *B = Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Value *C = Builder.CreateExtractElement(Ops[2], (uint64_t)0); + Function *FMA = CGM.getIntrinsic(Intrinsic::fma, A->getType()); + Value *Res = Builder.CreateCall(FMA, {A, B, C} ); + return Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0); + } case X86::BI__builtin_ia32_vfmaddps: case X86::BI__builtin_ia32_vfmaddpd: case X86::BI__builtin_ia32_vfmaddps256: case X86::BI__builtin_ia32_vfmaddpd256: - case X86::BI__builtin_ia32_vfmaddps512: - case X86::BI__builtin_ia32_vfmaddpd512: + case X86::BI__builtin_ia32_vfmaddps512_mask: + case X86::BI__builtin_ia32_vfmaddps512_maskz: + case X86::BI__builtin_ia32_vfmaddps512_mask3: + case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddpd512_mask: + case X86::BI__builtin_ia32_vfmaddpd512_maskz: + case X86::BI__builtin_ia32_vfmaddpd512_mask3: + case X86::BI__builtin_ia32_vfmsubpd512_mask3: + return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false); case X86::BI__builtin_ia32_vfmaddsubps: case X86::BI__builtin_ia32_vfmaddsubpd: case X86::BI__builtin_ia32_vfmaddsubps256: case X86::BI__builtin_ia32_vfmaddsubpd256: - case X86::BI__builtin_ia32_vfmaddsubps512: - case X86::BI__builtin_ia32_vfmaddsubpd512: - return EmitX86FMAExpr(*this, Ops, BuiltinID); + case X86::BI__builtin_ia32_vfmaddsubps512_mask: + case X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/true); case X86::BI__builtin_ia32_movdqa32store128_mask: case X86::BI__builtin_ia32_movdqa64store128_mask: Index: cfe/trunk/lib/Headers/avx512fintrin.h =================================================================== --- cfe/trunk/lib/Headers/avx512fintrin.h +++ cfe/trunk/lib/Headers/avx512fintrin.h @@ -2578,910 +2578,818 @@ (__mmask8)-1, (int)(R)) #define _mm512_fmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (int)(R)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_fmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_fnmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (int)(R)) + (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_fnmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)) + (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_fmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (int)(R)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_fmsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_fnmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (int)(R)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_fnmsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_fmaddsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)) + (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_fmsubadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)) + (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)_mm512_setzero_pd()) + (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) _mm512_setzero_pd()); + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_fmaddsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)) + (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_fmsubadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)) + (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)_mm512_setzero_ps()) + (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) _mm512_setzero_ps()); + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - (__m512d)__builtin_ia32_vfmaddpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - (__m512)__builtin_ia32_vfmaddps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - (__m512d)__builtin_ia32_vfmaddsubpd512 ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - (__m512)__builtin_ia32_vfmaddsubps512 ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, - __builtin_ia32_vfmaddpd512 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - __builtin_ia32_vfmaddps512 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(A)) + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (int)(R)), \ - (__v8df)(__m512d)(C)) + (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U, - (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, - -(__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __A); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U, - (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, - -(__v8df) __B, - -(__v8df) __C, - _MM_FROUND_CUR_DIRECTION), - (__v8df) __C); + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(A)) + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (int)(R)), \ - (__v16sf)(__m512)(C)) + (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, - -(__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __A); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, - (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, - -(__v16sf) __B, - -(__v16sf) __C, - _MM_FROUND_CUR_DIRECTION), - (__v16sf) __C); + return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } Index: cfe/trunk/lib/Sema/SemaChecking.cpp =================================================================== --- cfe/trunk/lib/Sema/SemaChecking.cpp +++ cfe/trunk/lib/Sema/SemaChecking.cpp @@ -2340,10 +2340,6 @@ case X86::BI__builtin_ia32_cvtuqq2ps512_mask: case X86::BI__builtin_ia32_sqrtpd512_mask: case X86::BI__builtin_ia32_sqrtps512_mask: - case X86::BI__builtin_ia32_vfmaddpd512: - case X86::BI__builtin_ia32_vfmaddps512: - case X86::BI__builtin_ia32_vfmaddsubpd512: - case X86::BI__builtin_ia32_vfmaddsubps512: ArgNum = 3; HasRC = true; break; @@ -2378,6 +2374,22 @@ case X86::BI__builtin_ia32_vfmaddss3_mask: case X86::BI__builtin_ia32_vfmaddss3_maskz: case X86::BI__builtin_ia32_vfmaddss3_mask3: + case X86::BI__builtin_ia32_vfmaddpd512_mask: + case X86::BI__builtin_ia32_vfmaddpd512_maskz: + case X86::BI__builtin_ia32_vfmaddpd512_mask3: + case X86::BI__builtin_ia32_vfmsubpd512_mask3: + case X86::BI__builtin_ia32_vfmaddps512_mask: + case X86::BI__builtin_ia32_vfmaddps512_maskz: + case X86::BI__builtin_ia32_vfmaddps512_mask3: + case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + case X86::BI__builtin_ia32_vfmaddsubps512_mask: + case X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case X86::BI__builtin_ia32_vfmsubaddps512_mask3: ArgNum = 4; HasRC = true; break; Index: cfe/trunk/test/CodeGen/avx512f-builtins.c =================================================================== --- cfe/trunk/test/CodeGen/avx512f-builtins.c +++ cfe/trunk/test/CodeGen/avx512f-builtins.c @@ -461,7 +461,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd // CHECK: @llvm.x86.avx512.vfmadd.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -483,7 +483,7 @@ // CHECK: fsub <8 x double> // CHECK: @llvm.x86.avx512.vfmadd.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -505,7 +505,7 @@ // CHECK: fsub <8 x double> // CHECK: @llvm.x86.avx512.vfmadd.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -521,7 +521,7 @@ // CHECK: fsub <8 x double> // CHECK: @llvm.x86.avx512.vfmadd.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -547,7 +547,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmadd_pd // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmadd_pd(__U, __A, __B, __C); } __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -569,7 +569,7 @@ // CHECK: fsub <8 x double> , %{{.*}} // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmsub_pd(__U, __A, __B, __C); } __m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -591,7 +591,7 @@ // CHECK: fsub <8 x double> , %{{.*}} // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C); } __m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -607,7 +607,7 @@ // CHECK: fsub <8 x double> , %{{.*}} // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C); } __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -633,7 +633,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps // CHECK: @llvm.x86.avx512.vfmadd.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -655,7 +655,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: @llvm.x86.avx512.vfmadd.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -677,7 +677,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: @llvm.x86.avx512.vfmadd.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -693,7 +693,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: @llvm.x86.avx512.vfmadd.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -717,7 +717,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmadd_ps // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmadd_ps(__U, __A, __B, __C); } __m512 test_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -739,7 +739,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmsub_ps(__U, __A, __B, __C); } __m512 test_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -761,7 +761,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fnmadd_ps(__U, __A, __B, __C); } __m512 test_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -777,7 +777,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fnmsub_ps(__U, __A, __B, __C); } __m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -803,7 +803,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -825,7 +825,7 @@ // CHECK: fsub <8 x double> , %{{.*}} // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -863,7 +863,7 @@ // check: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // check: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmaddsub_pd(__U, __A, __B, __C); } __m512d test_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { @@ -891,7 +891,7 @@ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} // CHECK: bitcast i8 %{{.*}} to <8 x i1> - // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer return _mm512_maskz_fmsubadd_pd(__U, __A, __B, __C); } __m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -917,7 +917,7 @@ // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -939,7 +939,7 @@ // CHECK: fsub <16 x float> , %{{.*}} // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -977,7 +977,7 @@ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmaddsub_ps(__U, __A, __B, __C); } __m512 test_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { @@ -1005,7 +1005,7 @@ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} // CHECK: bitcast i16 %{{.*}} to <16 x i1> - // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer return _mm512_maskz_fmsubadd_ps(__U, __A, __B, __C); } __m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {