Index: include/clang/Basic/BuiltinsX86.def =================================================================== --- include/clang/Basic/BuiltinsX86.def +++ include/clang/Basic/BuiltinsX86.def @@ -717,100 +717,13 @@ TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd, "V2dV2dV2dV2d", "nc", "fma|fma4") TARGET_BUILTIN(__builtin_ia32_vfmaddps256, "V8fV8fV8fV8f", "nc", "fma|fma4") TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "nc", "fma|fma4") -TARGET_BUILTIN(__builtin_ia32_vfnmaddps256, "V8fV8fV8fV8f", "nc", "fma|fma4") -TARGET_BUILTIN(__builtin_ia32_vfnmaddpd256, "V4dV4dV4dV4d", "nc", "fma|fma4") TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "nc", "fma|fma4") TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "nc", "fma|fma4") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_mask, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_mask3, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_maskz, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_mask, "V4dV4dV4dV4dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_mask3, "V4dV4dV4dV4dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_maskz, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmaddps128_mask, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddps128_mask3, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddps128_maskz, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddps256_mask, "V8fV8fV8fV8fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddps256_mask3, "V8fV8fV8fV8fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddps256_maskz, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_mask, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_mask3, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_maskz, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_mask, "V4dV4dV4dV4dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_mask3, "V4dV4dV4dV4dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_maskz, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_mask, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_mask3, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_maskz, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_mask, "V8fV8fV8fV8fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_mask3, 
"V8fV8fV8fV8fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_maskz, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmsubpd128_mask3, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_mask3, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmsubps128_mask3, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmsubps256_mask3, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd128_mask3, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_mask3, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfmsubaddps128_mask3, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_mask3, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfnmaddpd128_mask, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmaddpd256_mask, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfnmaddpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfnmaddps128_mask, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmaddps256_mask, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfnmaddps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd128_mask, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd128_mask3, "V2dV2dV2dV2dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd256_mask, "V4dV4dV4dV4dUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd256_mask3, "V4dV4dV4dV4dUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfnmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f") - -TARGET_BUILTIN(__builtin_ia32_vfnmsubps128_mask, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubps128_mask3, "V4fV4fV4fV4fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubps256_mask, "V8fV8fV8fV8fUc", "nc", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_vfnmsubps256_mask3, "V8fV8fV8fV8fUc", "nc", "avx512vl") - -TARGET_BUILTIN(__builtin_ia32_vfnmsubps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f") -TARGET_BUILTIN(__builtin_ia32_vfnmsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddpd512, "V8dV8dV8dV8dIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddps512, "V16fV16fV16fV16fIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512, "V8dV8dV8dV8dIi", "nc", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512, "V16fV16fV16fV16fIi", "nc", "avx512f") // XOP TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "nc", "xop") Index: lib/CodeGen/CGBuiltin.cpp 
=================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -8411,6 +8411,84 @@ return Res; } +// Lowers X86 FMA builtins to generic IR. +static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops, + unsigned BuiltinID) { + + bool IsAddSub = false; + bool IsScalar = false; + + // 4 operands always means rounding mode without a mask here. + bool IsRound = Ops.size() == 4; + + Intrinsic::ID ID; + switch (BuiltinID) { + default: break; + case clang::X86::BI__builtin_ia32_vfmaddss3: IsScalar = true; break; + case clang::X86::BI__builtin_ia32_vfmaddsd3: IsScalar = true; break; + case clang::X86::BI__builtin_ia32_vfmaddps512: + ID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break; + case clang::X86::BI__builtin_ia32_vfmaddpd512: + ID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break; + case clang::X86::BI__builtin_ia32_vfmaddsubps: IsAddSub = true; break; + case clang::X86::BI__builtin_ia32_vfmaddsubpd: IsAddSub = true; break; + case clang::X86::BI__builtin_ia32_vfmaddsubps256: IsAddSub = true; break; + case clang::X86::BI__builtin_ia32_vfmaddsubpd256: IsAddSub = true; break; + case clang::X86::BI__builtin_ia32_vfmaddsubps512: { + ID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512; + IsAddSub = true; + break; + } + case clang::X86::BI__builtin_ia32_vfmaddsubpd512: { + ID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512; + IsAddSub = true; + break; + } + } + + // Lower to generic IR only for _MM_FROUND_CUR_DIRECTION (4), i.e. no + // explicit rounding; any other rounding mode keeps the target intrinsic. + if (IsRound) { + Function *Intr = CGF.CGM.getIntrinsic(ID); + if (cast<ConstantInt>(Ops[3])->getZExtValue() != (uint64_t)4) + return CGF.Builder.CreateCall(Intr, Ops); + } + + Value *A = Ops[0]; + Value *B = Ops[1]; + Value *C = Ops[2]; + + if (IsScalar) { + A = CGF.Builder.CreateExtractElement(A, (uint64_t)0); + B = CGF.Builder.CreateExtractElement(B, (uint64_t)0); + C = CGF.Builder.CreateExtractElement(C, (uint64_t)0); + } + + llvm::Type *Ty = A->getType(); + Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); + Value *Res = CGF.Builder.CreateCall(FMA, {A, B, C}); + + if (IsScalar) + return CGF.Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0); + + if (IsAddSub) { + // For *addsub, also compute a*b-c and use a constant mask to select + // the subtracted result in the even elements. + unsigned NumElts = Ty->getVectorNumElements(); + SmallVector<Constant *, 16> NMask; + Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0); + Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1); + for (unsigned i = 0; i < NumElts; ++i) { + NMask.push_back(i % 2 == 0 ? 
One : Zero); + } + Value *NegMask = ConstantVector::get(NMask); + + Value *NegC = CGF.Builder.CreateFNeg(C); + Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} ); + Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res); + } + + return Res; +} + static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned, ArrayRef Ops) { llvm::Type *Ty = Ops[0]->getType(); @@ -8804,6 +8882,22 @@ case X86::BI__builtin_ia32_cvtq2mask512: return EmitX86ConvertToMask(*this, Ops[0]); + case X86::BI__builtin_ia32_vfmaddss3: + case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddps: + case X86::BI__builtin_ia32_vfmaddpd: + case X86::BI__builtin_ia32_vfmaddps256: + case X86::BI__builtin_ia32_vfmaddpd256: + case X86::BI__builtin_ia32_vfmaddps512: + case X86::BI__builtin_ia32_vfmaddpd512: + case X86::BI__builtin_ia32_vfmaddsubps: + case X86::BI__builtin_ia32_vfmaddsubpd: + case X86::BI__builtin_ia32_vfmaddsubps256: + case X86::BI__builtin_ia32_vfmaddsubpd256: + case X86::BI__builtin_ia32_vfmaddsubps512: + case X86::BI__builtin_ia32_vfmaddsubpd512: + return EmitX86FMAExpr(*this, Ops, BuiltinID); + case X86::BI__builtin_ia32_movdqa32store128_mask: case X86::BI__builtin_ia32_movdqa64store128_mask: case X86::BI__builtin_ia32_storeaps128_mask: Index: lib/Headers/avx512fintrin.h =================================================================== --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -2577,819 +2577,910 @@ (__mmask8)-1, (int)(R)); }) #define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (__mmask8)-1, \ - (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), (int)(R)); }) #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)); }) #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \ - 
(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (__mmask8)-1, \ - (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), (int)(R)); }) #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)); }) #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + 
_MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) _mm512_setzero_pd()); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) _mm512_setzero_pd()); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) 
_mm512_setzero_pd()); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) _mm512_setzero_pd()); } #define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (__mmask16)-1, \ - (int)(R)); }) + (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), (int)(R)); }) #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)); }) #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + 
__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (__mmask16)-1, \ - (int)(R)); }) + (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), (int)(R)); }) #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)); }) #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), \ + (__v16sf) __A); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), \ + (__v16sf) __C); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - 
return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __C); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) 
__B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } #define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)); }) #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)); }) #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)_mm512_setzero_pd()); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static 
__inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) _mm512_setzero_pd()); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) _mm512_setzero_pd()); } #define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)); }) #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) #define _mm512_mask3_fmaddsub_round_ps(A, B, C, 
U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)); }) #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)_mm512_setzero_ps()); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), \ + (__v16sf) __A); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), \ + (__v16sf) __C); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_maskz 
((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) _mm512_setzero_ps()); } #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + (__m512d)__builtin_ia32_vfmaddpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) - + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + (__m512)__builtin_ia32_vfmaddps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __C); } #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \ - 
(__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + (__m512d)__builtin_ia32_vfmaddsubpd512 ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + (__m512)__builtin_ia32_vfmaddsubps512 ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __C); } #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + __builtin_ia32_vfmaddpd512 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + __builtin_ia32_vfmaddps512 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A); } #define 
_mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(A)); }) #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (int)(R)), \ + (__v8df)(__m512d)(C)); }) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __A); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U, + (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + _MM_FROUND_CUR_DIRECTION), + (__v8df) __C); } #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(A)); }) #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (int)(R)), \ + (__v16sf)(__m512)(C)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_selectps_512((__mmask16) __U, + (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __C); }
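The select-based masking above is the core pattern of this patch: every masked FMA intrinsic becomes the unmasked FMA builtin wrapped in a per-lane select against the passthru operand, instead of calling a dedicated masked builtin. A minimal C sketch of the resulting semantics, assuming AVX512F and using the public intrinsics _mm512_fmadd_pd and _mm512_mask_mov_pd as stand-ins for the vfmaddpd512 and selectpd_512 builtins:

  #include <immintrin.h>

  /* Lane i yields a[i]*b[i]+c[i] when bit i of u is set, otherwise the
     passthru a[i] (the maskz forms select 0.0 instead of a[i]). */
  static __m512d mask_fmadd_pd_sketch(__m512d a, __mmask8 u,
                                      __m512d b, __m512d c) {
    __m512d fma = _mm512_fmadd_pd(a, b, c); /* unmasked FMA, all lanes */
    return _mm512_mask_mov_pd(a, u, fma);   /* per-lane: u ? fma : a   */
  }

Keeping the multiply-add as a plain llvm.fma in IR, with the masking expressed as a separate select, is intended to let the backend commute operands freely and fold the select back into the instruction's own mask.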
@@ -8151,27 +8242,27 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C); + __A[0] = (__U & 1) ? __Z[0] : 0; + return __A; } #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\ @@ -8183,11 +8274,11 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + (__v4sf) __X, + (__v4sf) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\ @@ -8199,27 +8290,27 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - (__v4sf) __A, - -(__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C); + __A[0] = (__U & 1) ? __Z[0] : 0; + return __A; } #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\ @@ -8231,11 +8322,11 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + (__v4sf) __X, + -(__v4sf) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; }
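The scalar ss/sd forms have no 128-bit select builtin to lean on, so the merge is written out directly with the vector-subscript extension: compute the full FMA, then blend element 0 under bit 0 of the mask. A short sketch of the same semantics, assuming FMA support and using the public _mm_fmadd_ss in place of the vfmaddss3 builtin:

  #include <immintrin.h>

  /* Element 0 becomes w[0]*a[0]+b[0] if bit 0 of u is set, else it stays
     w[0]; elements 1..3 always come through from w unchanged. */
  static __m128 mask_fmadd_ss_sketch(__m128 w, __mmask8 u,
                                     __m128 a, __m128 b) {
    __m128 z = _mm_fmadd_ss(w, a, b); /* z[0] = w[0]*a[0]+b[0] */
    w[0] = (u & 1) ? z[0] : w[0];     /* blend only element 0  */
    return w;
  }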
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\ @@ -8247,11 +8338,11 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - -(__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\ @@ -8263,43 +8354,43 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A, + -(__v4sf) __B, + (__v4sf) __C); + __A[0] = (__U & 1) ? __Z[0] : 0; + return __A; } #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ + (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(C), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + -(__v4sf) __X, + (__v4sf) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ + (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - -(__v4sf) __A, - -(__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\ @@ -8311,43 +8402,43 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C); + __A[0] = (__U & 1) ? 
__Z[0] : 0; + return __A; } #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ + (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ -(__v4sf)(__m128)(C), (__mmask8)(U), \ _MM_FROUND_CUR_DIRECTION); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128 __Z = __builtin_ia32_vfmaddss3((__v4sf) __W, + -(__v4sf) __X, + -(__v4sf) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ + (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - (__v2df) __A, - (__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + (__v2df) __A, + (__v2df) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\ @@ -8359,11 +8450,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A, + (__v2df) __B, + (__v2df) __C); + __A[0] = (__U & 1) ? __Z[0] : 0; + return __A; } #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\ @@ -8375,11 +8466,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + (__v2df) __X, + (__v2df) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\ @@ -8391,11 +8482,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - (__v2df) __A, - -(__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + (__v2df) __A, + -(__v2df) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\ @@ -8407,11 +8498,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A, + (__v2df) __B, + -(__v2df) __C); + __A[0] = (__U & 1) ? 
__Z[0] : 0; + return __A; } #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\ @@ -8423,11 +8514,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + (__v2df) __X, + -(__v2df) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\ @@ -8439,11 +8530,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - -(__v2df) __A, - (__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + -(__v2df) __A, + (__v2df) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\ @@ -8455,43 +8546,43 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A, + -(__v2df) __B, + (__v2df) __C); + __A[0] = (__U & 1) ? __Z[0] : 0; + return __A; } #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ + (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(C), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + -(__v2df) __X, + (__v2df) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - -(__v2df) __A, - -(__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B); + __W[0] = (__U & 1) ? __Z[0] : __W[0]; + return __W; } #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\ @@ -8503,16 +8594,16 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C); + __A[0] = (__U & 1) ? 
__Z[0] : 0; + return __A; } #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ + (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ -(__v2df)(__m128d)(C), \ (__mmask8)(U), \ _MM_FROUND_CUR_DIRECTION); }) @@ -8520,16 +8611,16 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W), - (__v2df) __X, - (__v2df) (__Y), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __m128d __Z = __builtin_ia32_vfmaddsd3((__v2df) __W, + -(__v2df) __X, + -(__v2df) __Y); + __Y[0] = (__U & 1) ? __Z[0] : __Y[0]; + return __Y; } #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ + (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), \ (__mmask8)(U), (int)(R)); }) Index: lib/Headers/avx512vlintrin.h =================================================================== --- lib/Headers/avx512vlintrin.h +++ lib/Headers/avx512vlintrin.h @@ -813,658 +813,722 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) 
__builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) 
__builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 
__C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS 
_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + 
__builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - 
(__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) 
__builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + (__v2df) __C), + (__v2df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + (__v4df) __C), + (__v4df) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) 
{ - return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } static __inline__ __m128d __DEFAULT_FN_ATTRS Index: lib/Sema/SemaChecking.cpp =================================================================== --- lib/Sema/SemaChecking.cpp +++ lib/Sema/SemaChecking.cpp @@ -2340,6 +2340,10 @@ case X86::BI__builtin_ia32_cvtuqq2ps512_mask: case X86::BI__builtin_ia32_sqrtpd512_mask: case X86::BI__builtin_ia32_sqrtps512_mask: + case X86::BI__builtin_ia32_vfmaddpd512: + case X86::BI__builtin_ia32_vfmaddps512: + case X86::BI__builtin_ia32_vfmaddsubpd512: + case X86::BI__builtin_ia32_vfmaddsubps512: ArgNum = 3; HasRC = true; break; @@ -2368,28 +2372,6 @@ case X86::BI__builtin_ia32_cvtsd2ss_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: - case X86::BI__builtin_ia32_vfmaddpd512_mask: - case X86::BI__builtin_ia32_vfmaddpd512_mask3: - case X86::BI__builtin_ia32_vfmaddpd512_maskz: - case X86::BI__builtin_ia32_vfmaddps512_mask: - case X86::BI__builtin_ia32_vfmaddps512_mask3: - case X86::BI__builtin_ia32_vfmaddps512_maskz: - case X86::BI__builtin_ia32_vfmaddsubpd512_mask: - case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: - case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: - case X86::BI__builtin_ia32_vfmaddsubps512_mask: - case X86::BI__builtin_ia32_vfmaddsubps512_mask3: - case X86::BI__builtin_ia32_vfmaddsubps512_maskz: - case X86::BI__builtin_ia32_vfmsubpd512_mask3: - case X86::BI__builtin_ia32_vfmsubps512_mask3: - case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: - case X86::BI__builtin_ia32_vfmsubaddps512_mask3: - case X86::BI__builtin_ia32_vfnmaddpd512_mask: - case X86::BI__builtin_ia32_vfnmaddps512_mask: - case X86::BI__builtin_ia32_vfnmsubpd512_mask: - case 
X86::BI__builtin_ia32_vfnmsubpd512_mask3: - case X86::BI__builtin_ia32_vfnmsubps512_mask: - case X86::BI__builtin_ia32_vfnmsubps512_mask3: case X86::BI__builtin_ia32_vfmaddsd3_mask: case X86::BI__builtin_ia32_vfmaddsd3_maskz: case X86::BI__builtin_ia32_vfmaddsd3_mask3: Index: test/CodeGen/avx512f-builtins.c =================================================================== --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -439,483 +439,745 @@ __m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 return _mm512_fmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 return _mm512_fmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) 
{ // CHECK-LABEL: @test_mm512_fnmadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 return _mm512_fnmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fnmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fnmsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 return _mm512_fnmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) return _mm512_fmadd_pd(__A, __B, __C); } __m512d test_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmadd_pd(__A, __U, __B, __C); } __m512d test_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_pd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmadd_pd(__A, __B, __C, __U); } __m512d test_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_pd - // CHECK: 
@llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmadd_pd(__U, __A, __B, __C); } __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) return _mm512_fmsub_pd(__A, __B, __C); } __m512d test_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmsub_pd(__A, __U, __B, __C); } __m512d test_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmsub_pd(__U, __A, __B, __C); } __m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fnmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) return _mm512_fnmadd_pd(__A, __B, __C); } __m512d test_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_pd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fnmadd_pd(__A, __B, __C, __U); } __m512d test_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C); } __m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fnmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) return _mm512_fnmsub_pd(__A, __B, __C); } __m512d test_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fnmsub_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C); } __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 return _mm512_fmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512 + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 return _mm512_fmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 return _mm512_fnmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fnmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 return _mm512_fnmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmadd_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) return _mm512_fmadd_ps(__A, __B, __C); } __m512 test_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmadd_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) return _mm512_mask_fmadd_ps(__A, __U, __B, __C); } __m512 test_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmadd_ps - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmadd_ps(__A, __B, __C, __U); } __m512 test_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmadd_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmadd_ps(__U, __A, __B, __C); } __m512 test_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsub_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) return _mm512_fmsub_ps(__A, __B, __C); } __m512 test_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsub_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmsub_ps(__A, __U, __B, __C); } __m512 test_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsub_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmsub_ps(__U, __A, __B, __C); } __m512 test_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmadd_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) return _mm512_fnmadd_ps(__A, __B, __C); } __m512 test_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmadd_ps - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fnmadd_ps(__A, __B, __C, __U); } __m512 test_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmadd_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fnmadd_ps(__U, __A, __B, __C); } __m512 test_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fnmsub_ps - // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) return _mm512_fnmsub_ps(__A, __B, __C); } __m512 test_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fnmsub_ps - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512 + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fnmsub_ps(__U, __A, __B, __C); } __m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmaddsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 return _mm512_fmaddsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmaddsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmaddsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsubadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 return _mm512_fmsubadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmsubadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_pd - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B,
__m512d __C) { // CHECK-LABEL: @test_mm512_fmaddsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_fmaddsub_pd(__A, __B, __C); } __m512d test_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmaddsub_pd(__A, __U, __B, __C); } __m512d test_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_pd - // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmaddsub_pd(__A, __B, __C, __U); } __m512d test_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_pd - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmaddsub_pd(__U, __A, __B, __C); } __m512d test_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_fmsubadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_fmsubadd_pd(__A, __B, __C); } __m512d test_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fmsubadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x
double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fmsubadd_pd(__A, __U, __B, __C); } __m512d test_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_pd - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_fmsubadd_pd(__U, __A, __B, __C); } __m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmaddsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 return _mm512_fmaddsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmaddsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmaddsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsubadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 return _mm512_fmsubadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmsubadd_round_ps(__A, __U, __B, 
__C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ps - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmaddsub_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_fmaddsub_ps(__A, __B, __C); } __m512 test_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmaddsub_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmaddsub_ps(__A, __U, __B, __C); } __m512 test_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmaddsub_ps - // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmaddsub_ps(__A, __B, __C, __U); } __m512 test_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmaddsub_ps - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmaddsub_ps(__U, __A, __B, __C); } __m512 test_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_fmsubadd_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // 
CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_fmsubadd_ps(__A, __B, __C); } __m512 test_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fmsubadd_ps - // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fmsubadd_ps(__A, __U, __B, __C); } __m512 test_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_maskz_fmsubadd_ps - // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_fmsubadd_ps(__U, __A, __B, __C); } __m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_pd - // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmsub_pd(__A, __B, __C, __U); } __m512 test_mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsub_ps - // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> 
%{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmsub_ps(__A, __B, __C, __U); } __m512d test_mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmsubadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_pd - // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: select <8 x i1> , <8 x double> %{{.*}}, <8 x double> %{{.*}} + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fmsubadd_pd(__A, __B, __C, __U); } __m512 test_mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmsubadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ps - // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: select <16 x i1> , <16 x float> %{{.*}}, <16 x float> %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fmsubadd_ps(__A, __B, __C, __U); } __m512d test_mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_round_pd - // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.512 + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fnmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fnmadd_pd(__A, __U, __B, __C); 
} __m512 test_mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ps - // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fnmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmadd_ps - // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fnmadd_ps(__A, __U, __B, __C); } __m512d test_mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_round_pd - // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.512 + // CHECK: fsub <8 x double> + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fnmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_pd - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.512 + // CHECK: fsub <8 x double> + // CHECK: fsub <8 x double> + // CHECK: @llvm.x86.avx512.vfmadd.pd.512 + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fnmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512d test_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_fnmsub_pd(__A, __U, __B, __C); } __m512d test_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_pd - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.512 + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: fsub <8 x double> , %{{.*}} + // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask3_fnmsub_pd(__A, __B, __C, __U); } __m512 test_mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ps - // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> 
%{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fnmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ps - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: @llvm.x86.avx512.vfmadd.ps.512 + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fnmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } __m512 test_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { // CHECK-LABEL: @test_mm512_mask_fnmsub_ps - // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_fnmsub_ps(__A, __U, __B, __C); } __m512 test_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { // CHECK-LABEL: @test_mm512_mask3_fnmsub_ps - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.512 + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: fsub <16 x float> , %{{.*}} + // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask3_fnmsub_ps(__A, __B, __C, __U); } @@ -5856,7 +6118,24 @@ __m128 test_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // CHECK-LABEL: @test_mm_mask_fmadd_ss - // CHECK: @llvm.x86.avx512.mask.vfmadd.ss + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask_fmadd_ss(__W, __U, __A, __B); } @@ -5868,7 +6147,23 @@ __m128 test_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // CHECK-LABEL: @test_mm_maskz_fmadd_ss - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // 
CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_maskz_fmadd_ss(__U, __A, __B, __C); } @@ -5880,7 +6175,24 @@ __m128 test_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmadd_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask3_fmadd_ss(__W, __X, __Y, __U); } @@ -5892,7 +6204,25 @@ __m128 test_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // CHECK-LABEL: @test_mm_mask_fmsub_ss - // CHECK: @llvm.x86.avx512.mask.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask_fmsub_ss(__W, __U, __A, __B); } @@ -5904,7 +6234,24 @@ __m128 test_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // CHECK-LABEL: @test_mm_maskz_fmsub_ss - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_maskz_fmsub_ss(__U, __A, __B, __C); } @@ -5916,7 +6263,25 @@ __m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_ss - // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: 
insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask3_fmsub_ss(__W, __X, __Y, __U); } @@ -5928,7 +6293,25 @@ __m128 test_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // CHECK-LABEL: @test_mm_mask_fnmadd_ss - // CHECK: @llvm.x86.avx512.mask.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask_fnmadd_ss(__W, __U, __A, __B); } @@ -5940,7 +6323,24 @@ __m128 test_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // CHECK-LABEL: @test_mm_maskz_fnmadd_ss - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_maskz_fnmadd_ss(__U, __A, __B, __C); } @@ -5952,7 +6352,25 @@ __m128 test_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmadd_ss - // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask3_fnmadd_ss(__W, __X, __Y, __U); } @@ -5964,7 +6382,26 @@ __m128 
test_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){ // CHECK-LABEL: @test_mm_mask_fnmsub_ss - // CHECK: @llvm.x86.avx512.mask.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask_fnmsub_ss(__W, __U, __A, __B); } @@ -5976,7 +6413,25 @@ __m128 test_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){ // CHECK-LABEL: @test_mm_maskz_fnmsub_ss - // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_maskz_fnmsub_ss(__U, __A, __B, __C); } @@ -5988,19 +6443,55 @@ __m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_ss - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: fsub <4 x float> , %{{.*}} + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi float + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U); } __m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss + // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } __m128d test_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // CHECK-LABEL: @test_mm_mask_fmadd_sd - // CHECK: @llvm.x86.avx512.mask.vfmadd.sd + // CHECK: 
extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask_fmadd_sd(__W, __U, __A, __B); } @@ -6012,7 +6503,23 @@ __m128d test_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // CHECK-LABEL: @test_mm_maskz_fmadd_sd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_maskz_fmadd_sd(__U, __A, __B, __C); } @@ -6024,7 +6531,24 @@ __m128d test_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmadd_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask3_fmadd_sd(__W, __X, __Y, __U); } @@ -6036,7 +6560,25 @@ __m128d test_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // CHECK-LABEL: @test_mm_mask_fmsub_sd - // CHECK: @llvm.x86.avx512.mask.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + 
// CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask_fmsub_sd(__W, __U, __A, __B); } @@ -6048,7 +6590,24 @@ __m128d test_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // CHECK-LABEL: @test_mm_maskz_fmsub_sd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_maskz_fmsub_sd(__U, __A, __B, __C); } @@ -6060,7 +6619,25 @@ __m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fmsub_sd - // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask3_fmsub_sd(__W, __X, __Y, __U); } @@ -6072,7 +6649,25 @@ __m128d test_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // CHECK-LABEL: @test_mm_mask_fnmadd_sd - // CHECK: @llvm.x86.avx512.mask.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask_fnmadd_sd(__W, __U, __A, __B); } @@ -6084,7 +6679,24 @@ __m128d test_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // CHECK-LABEL: @test_mm_maskz_fnmadd_sd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: 
extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_maskz_fnmadd_sd(__U, __A, __B, __C); } @@ -6096,7 +6708,25 @@ __m128d test_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmadd_sd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask3_fnmadd_sd(__W, __X, __Y, __U); } @@ -6108,7 +6738,26 @@ __m128d test_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){ // CHECK-LABEL: @test_mm_mask_fnmsub_sd - // CHECK: @llvm.x86.avx512.mask.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask_fnmsub_sd(__W, __U, __A, __B); } @@ -6120,7 +6769,25 @@ __m128d test_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){ // CHECK-LABEL: @test_mm_maskz_fnmsub_sd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + 
+ // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_maskz_fnmsub_sd(__U, __A, __B, __C); } @@ -6132,13 +6799,32 @@ __m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_sd - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: zext i8 %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 1 + // CHECK: icmp ne i32 %{{.*}}, 0 + // CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: br label %{{.*}} + + // CHECK: phi double + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U); } __m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){ // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd - // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd + // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION); } Index: test/CodeGen/avx512vl-builtins.c =================================================================== --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -1125,433 +1125,751 @@ __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128 + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmadd_pd(__A, __U, __B, __C); } __m128d test_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128 + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmsub_pd(__A, __U, __B, __C); } __m128d test_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmadd_pd - // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.128 + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fmadd_pd(__A, __B, __C, __U); } __m128d test_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fnmadd_pd - // CHECK: 
@llvm.x86.avx512.mask3.vfmadd.pd.128 + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fnmadd_pd(__A, __B, __C, __U); } __m128d test_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmadd_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128 + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmadd_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmsub_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128 + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmsub_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fnmadd_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128 + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fnmadd_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fnmsub_pd - // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128 + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: fsub <2 x double> , %{{.*}} + // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> + // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fnmsub_pd(__U, __A, __B, __C); } __m256d test_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmadd_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.256 + // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) + // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fmadd_pd(__A, __U, __B, __C); } __m256d test_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmsub_pd - // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.256 + // CHECK: fsub <4 x double> , %{{.*}} + // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x 
double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmsub_pd(__A, __U, __B, __C);
 }
 __m256d test_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmadd_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmadd_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmadd_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fnmadd_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmadd_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 __m256d test_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsub_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmsub_pd(__U, __A, __B, __C);
 }
 __m256d test_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmadd_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
 __m256d test_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmsub_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
 __m128 test_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmadd_ps(__A, __U, __B, __C);
 }
 __m128 test_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmsub_ps(__A, __U, __B, __C);
 }
 __m128 test_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmadd_ps(__A, __B, __C, __U);
 }
 __m128 test_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fnmadd_ps(__A, __B, __C, __U);
 }
 __m128 test_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmadd_ps(__U, __A, __B, __C);
 }
 __m128 test_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmsub_ps(__U, __A, __B, __C);
 }
 __m128 test_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fnmadd_ps(__U, __A, __B, __C);
 }
 __m128 test_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fnmsub_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fmadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fmadd_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fmsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fmsub_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fmadd_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fnmadd_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fmadd_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fmsub_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fnmadd_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fnmsub_ps(__U, __A, __B, __C);
 }
 __m128d test_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.128
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmaddsub_pd(__A, __U, __B, __C);
 }
 __m128d test_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmsubadd_pd(__A, __U, __B, __C);
 }
 __m128d test_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.128
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmaddsub_pd(__A, __B, __C, __U);
 }
 __m128d test_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.128
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
 __m128d test_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
 __m256d test_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmaddsub_pd(__A, __U, __B, __C);
 }
 __m256d test_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmsubadd_pd(__A, __U, __B, __C);
 }
 __m256d test_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmaddsub_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmaddsub_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
 __m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
 __m128 test_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmaddsub_ps(__A, __U, __B, __C);
 }
 __m128 test_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmsubadd_ps(__A, __U, __B, __C);
 }
 __m128 test_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmaddsub_ps(__A, __B, __C, __U);
 }
 __m128 test_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.128
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmaddsub_ps(__U, __A, __B, __C);
 }
 __m128 test_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmsubadd_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fmaddsub_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fmsubadd_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fmaddsub_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmaddsub_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fmaddsub_ps(__U, __A, __B, __C);
 }
 __m256 test_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_fmsubadd_ps(__U, __A, __B, __C);
 }
 __m128d test_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmsub_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmsub_pd(__A, __B, __C, __U);
 }
 __m128 test_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmsub_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fmsub_ps(__A, __B, __C, __U);
 }
 __m128d test_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmsubadd_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsubadd_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmsubadd_pd(__A, __B, __C, __U);
 }
 __m128 test_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmsubadd_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsubadd_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fmsubadd_ps(__A, __B, __C, __U);
 }
 __m128d test_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fnmadd_pd
-  // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fnmadd_pd(__A, __U, __B, __C);
 }
 __m256d test_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmadd_pd
-  // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fnmadd_pd(__A, __U, __B, __C);
 }
 __m128 test_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fnmadd_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmadd_ps
-  // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fnmadd_ps(__A, __U, __B, __C);
 }
 __m128d test_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fnmsub_pd
-  // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fnmsub_pd(__A, __U, __B, __C);
 }
 __m128d test_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.128
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: fsub <2 x double> , %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fnmsub_pd(__A, __B, __C, __U);
 }
 __m256d test_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmsub_pd
-  // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fnmsub_pd(__A, __U, __B, __C);
 }
 __m256d test_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmsub_pd
-  // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.256
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: fsub <4 x double> , %{{.*}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fnmsub_pd(__A, __B, __C, __U);
 }
 __m128 test_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fnmsub_ps(__A, __U, __B, __C);
 }
 __m128 test_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.128
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: fsub <4 x float> , %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fnmsub_ps(__A, __B, __C, __U);
 }
 __m256 test_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_fnmsub_ps(__A, __U, __B, __C);
 }
 __m256 test_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmsub_ps
-  // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.256
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask3_fnmsub_ps(__A, __B, __C, __U);
 }
Index: test/CodeGen/fma-builtins.c
===================================================================
--- test/CodeGen/fma-builtins.c
+++ test/CodeGen/fma-builtins.c
@@ -5,81 +5,105 @@
 __m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmadd_ps
-  // CHECK: @llvm.x86.fma.vfmadd.ps
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_fmadd_ps(a, b, c);
 }
 __m128d test_mm_fmadd_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmadd_pd
-  // CHECK: @llvm.x86.fma.vfmadd.pd
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_fmadd_pd(a, b, c);
 }
 __m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmadd_ss
-  // CHECK: @llvm.x86.fma.vfmadd.ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_fmadd_ss(a, b, c);
 }
 __m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmadd_sd
-  // CHECK: @llvm.x86.fma.vfmadd.sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_fmadd_sd(a, b, c);
 }
 __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_fmsub_ps(a, b, c);
 }
 __m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_fmsub_pd(a, b, c);
 }
 __m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ss
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_fmsub_ss(a, b, c);
 }
 __m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmsub_sd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_fmsub_sd(a, b, c);
 }
 __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> %{{.+}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_fnmadd_ps(a, b, c);
 }
 __m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmadd_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> %{{.+}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_fnmadd_pd(a, b, c);
 }
 __m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ss
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> [[NEG]], <4 x float> %{{.+}})
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_fnmadd_ss(a, b, c);
 }
 __m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmadd_sd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> [[NEG]], <2 x double> %{{.+}})
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_fnmadd_sd(a, b, c);
 }
@@ -87,7 +111,7 @@
   // CHECK-LABEL: test_mm_fnmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> [[NEG2]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_fnmsub_ps(a, b, c);
 }
@@ -95,7 +119,7 @@
   // CHECK-LABEL: test_mm_fnmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> [[NEG2]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_fnmsub_pd(a, b, c);
 }
@@ -103,7 +127,11 @@
   // CHECK-LABEL: test_mm_fnmsub_ss
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> [[NEG]], <4 x float> [[NEG2]])
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_fnmsub_ss(a, b, c);
 }
@@ -111,73 +139,87 @@
   // CHECK-LABEL: test_mm_fnmsub_sd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> [[NEG]], <2 x double> [[NEG2]])
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_fnmsub_sd(a, b, c);
 }
 __m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmaddsub_ps
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_fmaddsub_ps(a, b, c);
 }
 __m128d test_mm_fmaddsub_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmaddsub_pd
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_fmaddsub_pd(a, b, c);
 }
 __m128 test_mm_fmsubadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsubadd_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_fmsubadd_ps(a, b, c);
 }
 __m128d test_mm_fmsubadd_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmsubadd_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_fmsubadd_pd(a, b, c);
 }
 __m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_fmadd_ps
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_fmadd_ps(a, b, c);
 }
 __m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_fmadd_pd
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_fmadd_pd(a, b, c);
 }
 __m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_fmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> %{{.+}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_fmsub_ps(a, b, c);
 }
 __m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_fmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_fmsub_pd(a, b, c);
 }
 __m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_fnmadd_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> %{{.+}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_fnmadd_ps(a, b, c);
 }
 __m256d test_mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_fnmadd_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> %{{.+}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_fnmadd_pd(a, b, c);
 }
@@ -185,7 +227,7 @@
   // CHECK-LABEL: test_mm256_fnmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
   // CHECK: [[NEG2:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> [[NEG2]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_fnmsub_ps(a, b, c);
 }
@@ -193,32 +235,42 @@
   // CHECK-LABEL: test_mm256_fnmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> [[NEG2]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_fnmsub_pd(a, b, c);
 }
 __m256 test_mm256_fmaddsub_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_fmaddsub_ps
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_fmaddsub_ps(a, b, c);
 }
 __m256d test_mm256_fmaddsub_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_fmaddsub_pd
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_fmaddsub_pd(a, b, c);
 }
 __m256 test_mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_fmsubadd_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_fmsubadd_ps(a, b, c);
 }
 __m256d test_mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_fmsubadd_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_fmsubadd_pd(a, b, c);
 }
Index: test/CodeGen/fma4-builtins.c
===================================================================
--- test/CodeGen/fma4-builtins.c
+++ test/CodeGen/fma4-builtins.c
@@ -5,13 +5,13 @@
 __m128 test_mm_macc_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_macc_ps
-  // CHECK: @llvm.x86.fma.vfmadd.ps
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_macc_ps(a, b, c);
 }
 __m128d test_mm_macc_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_macc_pd
-  // CHECK: @llvm.x86.fma.vfmadd.pd
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_macc_pd(a, b, c);
 }
@@ -30,14 +30,14 @@
 __m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msub_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_msub_ps(a, b, c);
 }
 __m128d test_mm_msub_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_msub_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_msub_pd(a, b, c);
 }
@@ -58,14 +58,14 @@
 __m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmacc_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> %{{.+}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_nmacc_ps(a, b, c);
 }
 __m128d test_mm_nmacc_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_nmacc_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> %{{.+}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_nmacc_pd(a, b, c);
 }
@@ -87,7 +87,7 @@
   // CHECK-LABEL: test_mm_nmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> [[NEG2]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_nmsub_ps(a, b, c);
 }
@@ -95,7 +95,7 @@
   // CHECK-LABEL: test_mm_nmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> [[NEG2]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_nmsub_pd(a, b, c);
 }
@@ -117,67 +117,77 @@
 __m128 test_mm_maddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_maddsub_ps
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maddsub_ps(a, b, c);
 }
 __m128d test_mm_maddsub_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_maddsub_pd
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maddsub_pd(a, b, c);
 }
 __m128 test_mm_msubadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msubadd_ps
   // CHECK: [[NEG:%.+]] = fsub <4 x float> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_msubadd_ps(a, b, c);
 }
 __m128d test_mm_msubadd_pd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_msubadd_pd
   // CHECK: [[NEG:%.+]] = fsub <2 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: select <2 x i1> , <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_msubadd_pd(a, b, c);
 }
 __m256 test_mm256_macc_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_macc_ps
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_macc_ps(a, b, c);
 }
 __m256d test_mm256_macc_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_macc_pd
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_macc_pd(a, b, c);
 }
 __m256 test_mm256_msub_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_msub_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> %{{.+}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_msub_ps(a, b, c);
 }
 __m256d test_mm256_msub_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_msub_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_msub_pd(a, b, c);
 }
 __m256 test_mm256_nmacc_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_nmacc_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> %{{.+}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_nmacc_ps(a, b, c);
 }
 __m256d test_mm256_nmacc_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_nmacc_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> %{{.+}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_nmacc_pd(a, b, c);
 }
@@ -185,7 +195,7 @@
   // CHECK-LABEL: test_mm256_nmsub_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
   // CHECK: [[NEG2:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> [[NEG2]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_nmsub_ps(a, b, c);
 }
@@ -193,32 +203,42 @@
   // CHECK-LABEL: test_mm256_nmsub_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
   // CHECK: [[NEG2:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> [[NEG2]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_nmsub_pd(a, b, c);
 }
 __m256 test_mm256_maddsub_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_maddsub_ps
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps.256
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maddsub_ps(a, b, c);
 }
 __m256d test_mm256_maddsub_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_maddsub_pd
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd.256
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maddsub_pd(a, b, c);
 }
 __m256 test_mm256_msubadd_ps(__m256 a, __m256 b, __m256 c) {
   // CHECK-LABEL: test_mm256_msubadd_ps
   // CHECK: [[NEG:%.+]] = fsub <8 x float> , %{{.*}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: select <8 x i1> , <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_msubadd_ps(a, b, c);
 }
 __m256d test_mm256_msubadd_pd(__m256d a, __m256d b, __m256d c) {
   // CHECK-LABEL: test_mm256_msubadd_pd
   // CHECK: [[NEG:%.+]] = fsub <4 x double> , %{{.+}}
-  // CHECK: @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: select <4 x i1> , <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_msubadd_pd(a, b, c);
 }
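Reference sketch (not part of the patch): the masked CHECK pattern above, a plain @llvm.fma call followed by a select on the bitcast mask, has a simple scalar model. The helper below is hypothetical and only illustrative; the passthru argument stands in for whichever operand the _mask/_mask3/_maskz variant blends back in (e.g. __A for _mm256_mask_fmadd_pd, zero for _maskz).

#include <math.h>

/* Scalar model of "call <4 x double> @llvm.fma.v4f64" + "select <4 x i1>":
   each lane computes fma(a,b,c); lanes whose mask bit is clear keep the
   passthrough value instead. Assumes an 8-bit mask of which the low 4 bits
   are used, matching the bitcast-to-<8 x i1> + shufflevector in the IR. */
static void masked_fmadd_pd256(double dst[4], const double a[4],
                               const double b[4], const double c[4],
                               const double passthru[4], unsigned char mask) {
  for (int i = 0; i < 4; ++i)
    dst[i] = ((mask >> i) & 1) ? fma(a[i], b[i], c[i]) : passthru[i];
}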