Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -201,6 +201,7 @@
       /// Combined add and sub on an FP vector.
       ADDSUB,
+      SUBADD,
 
       // FP vector ops with rounding mode.
       FADD_RND, FADDS_RND,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7427,6 +7427,105 @@
   return true;
 }
 
+/// Returns true iff \p BV builds a vector whose result is equivalent to the
+/// result of a SUBADD operation.
+/// If true is returned, the operands of the SUBADD = Opnd0 -+ Opnd1 operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isSubAdd(const BuildVectorSDNode *BV,
+                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+
+  MVT VT = BV->getSimpleValueType(0);
+  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue InVec0 = DAG.getUNDEF(VT);
+  SDValue InVec1 = DAG.getUNDEF(VT);
+
+  // Even-numbered elements in the input build vector are obtained from
+  // adding two float elements.
+  // Odd-numbered elements in the input build vector are obtained from
+  // subtracting two float elements.
+  unsigned ExpectedOpcode = ISD::FADD;
+  unsigned NextExpectedOpcode = ISD::FSUB;
+  bool SubFound = false;
+  bool AddFound = false;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Op = BV->getOperand(i);
+
+    // Skip 'undef' values.
+    unsigned Opcode = Op.getOpcode();
+    if (Opcode == ISD::UNDEF) {
+      std::swap(ExpectedOpcode, NextExpectedOpcode);
+      continue;
+    }
+
+    // Early exit if we found an unexpected opcode.
+    if (Opcode != ExpectedOpcode)
+      return false;
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+    // Early exit if we cannot match that sequence.
+    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+        Op0.getOperand(1) != Op1.getOperand(1))
+      return false;
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    if (I0 != i)
+      return false;
+
+    // We found a valid add/sub node. Update the information accordingly.
+    if (i & 1)
+      SubFound = true;
+    else
+      AddFound = true;
+
+    // Update InVec0 and InVec1.
+    if (InVec0.isUndef()) {
+      InVec0 = Op0.getOperand(0);
+      if (InVec0.getSimpleValueType() != VT)
+        return false;
+    }
+    if (InVec1.isUndef()) {
+      InVec1 = Op1.getOperand(0);
+      if (InVec1.getSimpleValueType() != VT)
+        return false;
+    }
+
+    // Make sure that the operands of each sub/add node always come from the
+    // same pair of vectors. FSUB is not commutable, and we conservatively do
+    // not try to commute the FADD operands either, so reject any mismatch.
+    if (InVec0 != Op0.getOperand(0) || InVec1 != Op1.getOperand(0))
+      return false;
+
+    // Update the pair of expected opcodes.
+    std::swap(ExpectedOpcode, NextExpectedOpcode);
+  }
+
+  // Don't try to fold this build_vector into a SUBADD if the inputs are undef.
+  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+    return false;
+
+  Opnd0 = InVec0;
+  Opnd1 = InVec1;
+  return true;
+}
+
 /// Returns true if is possible to fold MUL and an idiom that has already been
 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
 /// If (and only if) true is returned, the operands of FMADDSUB are written to
@@ -7504,6 +7603,34 @@
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
+/// Try to fold a build_vector that performs a 'subadd' or 'fmsubadd' operation
+/// into an X86ISD::SUBADD or X86ISD::FMSUBADD node.
+static SDValue lowerToSubAddOrFMSubAdd(const BuildVectorSDNode *BV,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1))
+    return SDValue();
+
+  MVT VT = BV->getSimpleValueType(0);
+  SDLoc DL(BV);
+
+  // Try to generate X86ISD::FMSUBADD node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::SUBADD node for 512-bit types even though
+  // the SUBADD idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit SUBADD instructions!
+  // 512-bit SUBADD idiom recognition was needed only as part of FMSUBADD idiom
+  // recognition.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  return DAG.getNode(X86ISD::SUBADD, DL, VT, Opnd0, Opnd1);
+}
+
 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                    const X86Subtarget &Subtarget,
@@ -7837,6 +7964,8 @@
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
     return AddSub;
+  if (SDValue SubAdd = lowerToSubAddOrFMSubAdd(BV, Subtarget, DAG))
+    return SubAdd;
   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
     return HorizontalOp;
   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
@@ -29567,6 +29696,69 @@
   return true;
 }
 
+/// Returns true iff the shuffle node \p N can be replaced with a SUBADD
+/// operation. If true is returned, the operands of the SUBADD operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
+///
+/// We combine shuffles to SUBADD directly on the abstract vector shuffle nodes
+/// so it is easier to match the pattern generically.
+static bool isSubAdd(SDNode *N, const X86Subtarget &Subtarget,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+
+  EVT VT = N->getValueType(0);
+  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
+
+  // We only handle target-independent shuffles.
+  // FIXME: It would be easy and harmless to use the target shuffle mask
+  // extraction tool to support more.
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
+  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
+
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  // We require the first shuffle operand to be the FADD node, and the second
+  // to be the FSUB node.
+  if (V1.getOpcode() == ISD::FSUB && V2.getOpcode() == ISD::FADD) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  } else if (V1.getOpcode() != ISD::FADD || V2.getOpcode() != ISD::FSUB)
+    return false;
+
+  // If there are other uses of these operations we can't fold them.
+  if (!V1->hasOneUse() || !V2->hasOneUse())
+    return false;
+
+  // Ensure that both operations have the same operands. Note that we can
+  // commute the FADD operands, but FSUB is not commutable, so take LHS and
+  // RHS from the FSUB node.
+  SDValue LHS = V2->getOperand(0), RHS = V2->getOperand(1);
+  if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
+      (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
+    return false;
+
+  // We're looking for blends between FADD and FSUB nodes. We insist on these
+  // nodes being lined up in a specific expected pattern.
+  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+                                           8, 25, 10, 27, 12, 29, 14, 31})))
+    return false;
+
+  Opnd0 = LHS;
+  Opnd1 = RHS;
+  return true;
+}
+
 /// \brief Try to combine a shuffle into a target-specific add-sub or
 /// mul-add-sub node.
 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
@@ -29593,6 +29785,33 @@
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
+/// \brief Try to combine a shuffle into a target-specific sub-add or
+/// mul-sub-add node.
+static SDValue combineShuffleToSubAddOrFMSubAdd(SDNode *N,
+                                                const X86Subtarget &Subtarget,
+                                                SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isSubAdd(N, Subtarget, Opnd0, Opnd1))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Try to generate X86ISD::FMSUBADD node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::SUBADD node for 512-bit types even though
+  // the SUBADD idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit SUBADD instructions!
+  if (VT.is512BitVector())
+    return SDValue();
+
+  return DAG.getNode(X86ISD::SUBADD, DL, VT, Opnd0, Opnd1);
+}
+
 // We are looking for a shuffle where both sources are concatenated with undef
 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
 // if we can express this as a single-source shuffle, that's preferable.
@@ -29679,10 +29898,13 @@
   EVT VT = N->getValueType(0);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // If we have legalized the vector types, look for blends of FADD and FSUB
-  // nodes that we can fuse into an ADDSUB node.
+  // nodes that we can fuse into an ADDSUB or SUBADD node.
if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; + + if (SDValue SubAdd = combineShuffleToSubAddOrFMSubAdd(N, Subtarget, DAG)) + return SubAdd; if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; Index: test/CodeGen/X86/fmsubadd-combine.ll =================================================================== --- test/CodeGen/X86/fmsubadd-combine.ll +++ test/CodeGen/X86/fmsubadd-combine.ll @@ -8,26 +8,17 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { ; FMA3_256-LABEL: mul_subadd_pd128: ; FMA3_256: # BB#0: # %entry -; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA3_256-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA3_256-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd128: ; FMA3_512: # BB#0: # %entry -; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA3_512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA3_512-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd128: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA4-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %AB = fmul <2 x double> %A, %B @@ -40,18 +31,12 @@ define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { ; FMA3-LABEL: mul_subadd_ps128: ; FMA3: # BB#0: # %entry -; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1 -; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; FMA3-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; FMA3-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps128: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1 -; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; FMA4-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %AB = fmul <4 x float> %A, %B @@ -64,18 +49,12 @@ define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { ; FMA3-LABEL: mul_subadd_pd256: ; FMA3: # BB#0: # %entry -; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1 -; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; FMA3-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; FMA3-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd256: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1 -; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 ; FMA4-NEXT: retq entry: %AB = fmul <4 x double> %A, %B @@ -88,18 +67,12 @@ define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { ; FMA3-LABEL: mul_subadd_ps256: ; FMA3: # BB#0: # %entry -; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1 -; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; FMA3-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; FMA3-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps256: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1 -; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 ; FMA4-NEXT: retq entry: %AB = fmul <8 x float> %A, %B @@ -112,34 +85,19 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { ; FMA3_256-LABEL: mul_subadd_pd512: ; FMA3_256: # BB#0: # %entry -; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2 -; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3 -; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; FMA3_256-NEXT: vfmsubadd213pd %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmsubadd213pd %ymm5, %ymm3, %ymm1 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd512: ; FMA3_512: # BB#0: # %entry -; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1 -; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0 -; FMA3_512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7] +; FMA3_512-NEXT: vfmsubadd213pd %zmm2, %zmm1, %zmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd512: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2 -; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3 -; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; FMA4-NEXT: vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: retq entry: %AB = fmul <8 x double> %A, %B @@ -152,35 +110,19 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { ; FMA3_256-LABEL: mul_subadd_ps512: ; FMA3_256: # BB#0: # %entry -; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2 -; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3 -; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1 -; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; FMA3_256-NEXT: vfmsubadd213ps %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmsubadd213ps %ymm5, %ymm3, %ymm1 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_ps512: ; FMA3_512: # BB#0: # %entry -; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1 -; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0 -; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA -; FMA3_512-NEXT: kmovw %eax, %k1 -; FMA3_512-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} +; FMA3_512-NEXT: vfmsubadd213ps %zmm2, %zmm1, %zmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps512: ; FMA4: # BB#0: # %entry -; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; 
FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2 -; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3 -; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1 -; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; FMA4-NEXT: vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: retq entry: %AB = fmul <16 x float> %A, %B
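
Note (illustrative only, not part of the diff): a minimal LLVM IR sketch of the idioms the new matchers look for. The function names are made up for illustration; the lane layout follows the {0, 5, 2, 7} mask checked in isSubAdd (even lanes from the FADD, odd lanes from the FSUB), and forming FMSUBADD additionally assumes an FMA-capable subtarget, as the FMA3/FMA4 check prefixes above suggest.

; SUBADD shape: even lanes taken from the FADD, odd lanes from the FSUB.
define <4 x float> @subadd_example(<4 x float> %A, <4 x float> %B) {
  %add = fadd <4 x float> %A, %B
  %sub = fsub <4 x float> %A, %B
  %r = shufflevector <4 x float> %add, <4 x float> %sub,
                     <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %r
}

; FMSUBADD shape: the same blend, with a multiply feeding both the FADD and
; the FSUB so that isFMAddSub can fold it into the fused node.
define <4 x float> @fmsubadd_example(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
  %AB = fmul <4 x float> %A, %B
  %add = fadd <4 x float> %AB, %C
  %sub = fsub <4 x float> %AB, %C
  %r = shufflevector <4 x float> %add, <4 x float> %sub,
                     <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %r
}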