Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -488,6 +488,13 @@
       FMADDSUB_RND, FMSUBADD_RND,
 
+      // Scalar intrinsic FMA with rounding mode.
+      // Two versions, passthru bits on op1 or op3.
+      FMADDS1_RND, FMADDS3_RND,
+      FNMADDS1_RND, FNMADDS3_RND,
+      FMSUBS1_RND, FMSUBS3_RND,
+      FNMSUBS1_RND, FNMSUBS3_RND,
+
       // Compress and expand.
       COMPRESS, EXPAND,
 
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -23435,6 +23435,14 @@
   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
+  case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
+  case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
+  case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
+  case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
+  case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
+  case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
@@ -31709,14 +31717,17 @@
   unsigned NewOpcode = 0;
   if (Arg.hasOneUse()) {
     switch (Arg.getOpcode()) {
-    case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
-    case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
-    case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
-    case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
-    case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
-    case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
-    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
-    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
+    case X86ISD::FMADD:       NewOpcode = X86ISD::FNMSUB;     break;
+    case X86ISD::FMSUB:       NewOpcode = X86ISD::FNMADD;     break;
+    case X86ISD::FNMADD:      NewOpcode = X86ISD::FMSUB;      break;
+    case X86ISD::FNMSUB:      NewOpcode = X86ISD::FMADD;      break;
+    case X86ISD::FMADD_RND:   NewOpcode = X86ISD::FNMSUB_RND; break;
+    case X86ISD::FMSUB_RND:   NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMADD_RND:  NewOpcode = X86ISD::FMSUB_RND;  break;
+    case X86ISD::FNMSUB_RND:  NewOpcode = X86ISD::FMADD_RND;  break;
+    // We can't handle scalar intrinsic node here because it would only
+    // invert one element and not the whole vector. But we could try to handle
+    // a negation of the lower element only.
    }
   }
   if (NewOpcode)
@@ -32250,15 +32261,6 @@
   SDValue B = N->getOperand(1);
   SDValue C = N->getOperand(2);
 
-  auto isScalarMaskedNode = [&](SDValue &V) {
-    if (V.hasOneUse())
-      return false;
-    for (auto User : V.getNode()->uses())
-      if (User->getOpcode() == X86ISD::SELECTS && N->isOperandOf(User))
-        return true;
-    return false;
-  };
-
   auto invertIfNegative = [](SDValue &V) {
     if (SDValue NegVal = isFNEG(V.getNode())) {
       V = NegVal;
@@ -32267,10 +32269,11 @@
     return false;
   };
 
-  // Do not convert scalar masked operations.
-  bool NegA = !isScalarMaskedNode(A) && invertIfNegative(A);
-  bool NegB = !isScalarMaskedNode(B) && invertIfNegative(B);
-  bool NegC = !isScalarMaskedNode(C) && invertIfNegative(C);
+  // Do not convert the passthru input of scalar intrinsics.
+  // FIXME: We could allow negations of the lower element only.
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegB = invertIfNegative(B);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
@@ -32281,16 +32284,35 @@
   else
     NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
 
+
   if (N->getOpcode() == X86ISD::FMADD_RND) {
     switch (NewOpcode) {
-    case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
-    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
-    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
-    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND;  break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND;  break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
     }
-    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND;  break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND;  break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND;  break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND;  break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
+    }
+  } else {
+    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+           "Unexpected opcode!");
+    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
   }
-  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+
+  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
 }
 
 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -33057,6 +33079,8 @@
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
   case X86ISD::FMADD:
   case X86ISD::FMADD_RND:
+  case X86ISD::FMADDS1_RND:
+  case X86ISD::FMADDS3_RND:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
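Note: the combineFMA guards above can be exercised directly from IR. A minimal
sketch follows (the function name is invented for illustration; the intrinsic
declaration is the one used by the tests in this patch, and i32 4 is the
current-direction rounding argument). The fneg here feeds operand 2, which
carries no passthru bits, so combineFMA may strip it and rewrite the node to
FNMADDS1_RND; an fneg feeding %a (the passthru operand of the S1 nodes) or,
for the S3 nodes, %c has to stay outside the node.

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
                                                     <2 x double>, i8, i32)

; Folding is safe here: only element 0 of %negb feeds the scalar FMA, so the
; negation can be absorbed by flipping the opcode without disturbing the
; upper element that passes through from %a.
define <2 x double> @fold_negated_multiplicand(<2 x double> %a, <2 x double> %b,
                                               <2 x double> %c) {
entry:
  %negb = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a,
                 <2 x double> %negb, <2 x double> %c, i8 -1, i32 4)
  ret <2 x double> %res
}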
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -5580,14 +5580,16 @@
 }// Constraints = "$src1 = $dst"
 
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                       string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ ,
-                       string SUFF> {
+                       string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                       SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
  defm NAME#213#SUFF#Z: avx512_fma3s_common<...>;
  defm NAME#231#SUFF#Z: avx512_fma3s_common<...>;
  defm NAME#132#SUFF#Z: avx512_fma3s_common<...>;
 }
 
 multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{
+                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                        SDNode OpNodeRnds3> {
   let Predicates = [HasAVX512] in {
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                OpNodeRnd, f32x_info, "SS">,
-                EVEX_CD8<32, CD8VT1>, VEX_LIG;
+                OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+                EVEX_CD8<32, CD8VT1>, VEX_LIG;
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                OpNodeRnd, f64x_info, "SD">,
-                EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+                OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+                EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
   }
 }
 
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+                           X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+                           X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+                            X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+                            X86FnmsubRnds1, X86FnmsubRnds3>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
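Note: threading two rounded nodes through avx512_fma3s_all is what lets the
213 and 132 encodings (destination register tied to intrinsic operand 1) match
the *S1 nodes, while the 231 encoding (destination tied to the addend, operand
3) matches the *S3 nodes. A sketch of the two shapes, with the expected
selections taken from the updated CHECK lines in the tests below (modulo
register copies; function names are invented):

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
                                                     <2 x double>, i8, i32)
declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>,
                                                      <2 x double>, i8, i32)

; Passthru is operand 1, so the destination register must hold %a:
; expected to select vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}.
define <2 x double> @s1_shape(<2 x double> %a, <2 x double> %b,
                              <2 x double> %c, i8 %k) {
  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a,
                 <2 x double> %b, <2 x double> %c, i8 %k, i32 4)
  ret <2 x double> %res
}

; Passthru is operand 3, so the destination register must hold %c:
; expected to select vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1}.
define <2 x double> @s3_shape(<2 x double> %a, <2 x double> %b,
                              <2 x double> %c, i8 %k) {
  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a,
                 <2 x double> %b, <2 x double> %c, i8 %k, i32 4)
  ret <2 x double> %res
}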
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -466,6 +466,18 @@
 def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
 def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
 
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1  : SDNode<"X86ISD::FMADDS1_RND",  SDTFmaRound>;
+def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
+def X86FmsubRnds1  : SDNode<"X86ISD::FMSUBS1_RND",  SDTFmaRound>;
+def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3  : SDNode<"X86ISD::FMADDS3_RND",  SDTFmaRound>;
+def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
+def X86FmsubRnds3  : SDNode<"X86ISD::FMSUBS3_RND",  SDTFmaRound>;
+def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
+
 def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
 def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
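Note: "passthru bits in operand 1/3" refers to where the untouched vector
elements of the result come from. Spelled out in plain IR for the S1 flavor
(illustrative only, and names invented; element 0 is really computed as a
single fused operation with an explicit rounding operand, not as separate
fmul and fadd):

; FMADDS1_RND(a, b, c, rc): element 0 = a[0]*b[0]+c[0], upper elements = a.
; The S3 flavor differs only in taking the upper elements from c instead.
define <2 x double> @fmadds1_lane_semantics(<2 x double> %a, <2 x double> %b,
                                            <2 x double> %c) {
  %a0 = extractelement <2 x double> %a, i32 0
  %b0 = extractelement <2 x double> %b, i32 0
  %c0 = extractelement <2 x double> %c, i32 0
  %mul = fmul double %a0, %b0
  %fma0 = fadd double %mul, %c0
  %res = insertelement <2 x double> %a, double %fma0, i32 0
  ret <2 x double> %res
}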
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1192,8 +1192,8 @@
   X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1326,8 +1326,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1345,8 +1345,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
                      X86ISD::FMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1365,8 +1365,8 @@
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
                      X86ISD::FNMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
                      X86ISD::VFIXUPIMM, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1404,8 +1404,8 @@
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
                      X86ISD::FMADD_RND),
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
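Note: both the mask (FMA_OP_SCALAR_MASK) and maskz (FMA_OP_SCALAR_MASKZ) rows
now map to the S1 nodes, because in both cases the upper elements of the
result come from operand 1; only the mask3 rows map to the S3 nodes. This is
what eliminates the extra vmovaps in the maskz tests below: once the
destination can be tied to %a, no copy is needed. A small reproducer (function
name invented; the declaration matches the tests):

declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>,
                                                     <4 x float>, i8, i32)

; Expected to select a single vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; with no trailing vmovaps, per the updated CHECK lines below.
define <4 x float> @maskz_s1(<4 x float> %a, <4 x float> %b,
                             <4 x float> %c, i8 %k) {
  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %a,
                 <4 x float> %b, <4 x float> %c, i8 %k, i32 4)
  ret <4 x float> %res
}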
Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4707,15 +4707,15 @@
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm5
-; CHECK-NEXT:    vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
-; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4735,15 +4735,15 @@
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm0, %xmm4
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm5
-; CHECK-NEXT:    vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
-; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4762,10 +4762,10 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm3
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, %xmm3
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
@@ -4780,8 +4780,7 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
@@ -4797,13 +4796,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4825,13 +4824,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4853,13 +4852,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4881,13 +4880,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4909,13 +4908,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
@@ -4937,13 +4936,13 @@
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm4
-; CHECK-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm5
 ; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT:    vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm5, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
@@ -4988,8 +4987,7 @@
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
-; CHECK-NEXT:    vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load float, float* %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
Index: llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll
+++ llvm/trunk/test/CodeGen/X86/fma-fneg-combine.ll
@@ -126,8 +126,8 @@
 define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2
@@ -188,7 +188,7 @@
 ; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
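Note: test10 above shows the flip side of the combineFMA guard. An fneg of the
whole result vector cannot be folded into the scalar node, since FNMSUBS1_RND
would negate only element 0 while the IR negates every element, so codegen now
keeps the FMA and negates the full vector afterwards (vfmadd213sd followed by
vxorps). A reduced form of that test (function name invented):

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
                                                     <2 x double>, i8, i32)

; The fneg below must stay a separate vxorps: folding it into the FMA node
; would leave the upper element, which passes through from %a, un-negated.
define <2 x double> @neg_of_whole_result(<2 x double> %a, <2 x double> %b,
                                         <2 x double> %c) {
entry:
  %0 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a,
                <2 x double> %b, <2 x double> %c, i8 -1, i32 4)
  %neg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %0
  ret <2 x double> %neg
}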